default rel
%define XMMWORD
%define YMMWORD
%define ZMMWORD
section .text code align=64


EXTERN  OPENSSL_ia32cap_P

global  sha1_multi_block

ALIGN   32
;-----------------------------------------------------------------------
; sha1_multi_block -- entry point, CPU-feature dispatch and Win64 prologue
;
; ABI on entry: Microsoft x64 (arguments arrive in rcx, rdx, r8).  They are
; immediately copied into rdi, rsi, rdx, which is how the rest of the
; routine refers to them:
;   rdi = arg1: base of the state block; five 16-byte rows are loaded from
;               it at a 32-byte stride further down
;   rsi = arg2: array of four 16-byte descriptors, each a qword pointer at
;               +0 and a signed dword count at +8
;   edx = arg3: saved to [280+rsp] at the top of $L$oop_grande; its later
;               use is outside the visible part of this routine
;
; Saved here: rdi/rsi (into the caller's home space), rbx, rbp and
; xmm6-xmm15, all of which are callee-saved under the Microsoft x64 ABI.
;
; Stack frame once this block has run (rsp is 256-byte aligned):
;   [rsp+0   .. rsp+255]  sixteen 16-byte slots (addressed later through
;                         rax = rsp+128 with displacements -128..+112)
;   [rsp+256 .. rsp+271]  four dword per-lane counts (rbx points here)
;   [rsp+272]             value of rsp on entry, needed to undo the frame
;   [rsp+280]             saved copy of edx
;-----------------------------------------------------------------------
sha1_multi_block:
        mov     QWORD[8+rsp],rdi        ;WIN64 prologue
        mov     QWORD[16+rsp],rsi
        mov     rax,rsp
$L$SEH_begin_sha1_multi_block:
        ; Remap the Win64 argument registers onto rdi/rsi/rdx.
        mov     rdi,rcx
        mov     rsi,rdx
        mov     rdx,r8



        ; Feature dispatch on the 64 bits starting at OPENSSL_ia32cap_P+4.
        ; Bit 61 of that qword (bit 29 of the dword at +12) selects
        ; _shaext_shortcut; bit 28 of the low dword (268435456 = 1<<28)
        ; selects _avx_shortcut.  Both targets live outside this chunk.
        ; NOTE(review): presumably the SHA-extensions and AVX capability
        ; bits respectively -- confirm against the capability-vector docs.
        mov     rcx,QWORD[((OPENSSL_ia32cap_P+4))]
        bt      rcx,61
        jc      NEAR _shaext_shortcut
        test    ecx,268435456
        jnz     NEAR _avx_shortcut
        ; Generic path: rax keeps the entry rsp as the frame anchor.
        mov     rax,rsp

        push    rbx

        push    rbp

        ; Reserve 168 bytes = 10 x 16 (xmm6-xmm15) + 8 bytes of padding.
        ; rsp is now rax-184, so [rsp+64] and [rax-120] are the same
        ; address: the ten saves below form one contiguous area even
        ; though the first four are rsp-relative and the rest rax-relative.
        lea     rsp,[((-168))+rsp]
        movaps  XMMWORD[rsp],xmm6
        movaps  XMMWORD[16+rsp],xmm7
        movaps  XMMWORD[32+rsp],xmm8
        movaps  XMMWORD[48+rsp],xmm9
        movaps  XMMWORD[(-120)+rax],xmm10
        movaps  XMMWORD[(-104)+rax],xmm11
        movaps  XMMWORD[(-88)+rax],xmm12
        movaps  XMMWORD[(-72)+rax],xmm13
        movaps  XMMWORD[(-56)+rax],xmm14
        movaps  XMMWORD[(-40)+rax],xmm15
        ; Carve out the working frame below the save area and round rsp
        ; down to a 256-byte boundary; because the alignment destroys the
        ; relationship to the old rsp, the entry rsp is parked at [272+rsp].
        sub     rsp,288
        and     rsp,-256
        mov     QWORD[272+rsp],rax

;-----------------------------------------------------------------------
; $L$body / $L$oop_grande -- per-pass lane setup
;
; Register roles established here:
;   rbp       = address of K_XX_XX (constant table defined outside this
;               chunk); also used as a stand-in input pointer for lanes
;               whose count is <= 0
;   rbx       = rsp+256, four dword slots receiving the per-lane counts
;   r8..r11   = input pointers for lanes 0..3
;   edx       = signed maximum of the four counts (0 if none is positive)
;   xmm10..14 = five 16-byte rows loaded from rdi at a 32-byte stride
;               NOTE(review): presumably the SHA-1 a,b,c,d,e words, one
;               lane per dword -- confirm against the context layout
;   rax       = rsp+128, biased base for the sixteen 16-byte stack slots
;   xmm5      = 16 bytes from [rbp+96], used below as the pshufb control
;   xmm15     = 16 bytes from [rbp-32], added into every round
;               NOTE(review): presumably the byte-swap mask and the first
;               round constant -- K_XX_XX contents are not visible here
;-----------------------------------------------------------------------
$L$body:
        lea     rbp,[K_XX_XX]
        lea     rbx,[256+rsp]

$L$oop_grande:
        mov     DWORD[280+rsp],edx
        ; edx is reused as the running maximum of the lane counts.
        xor     edx,edx

        ; Lane 0: descriptor at rsi+0.  The same six-instruction pattern
        ; repeats for lanes 1-3 at rsi+16, +32 and +48:
        ;   - fold the count into the signed maximum in edx (cmovg)
        ;   - record the count at [rbx + 4*lane]
        ;   - if the count is <= 0, redirect the lane's pointer to rbp
        ;     ("mov" leaves the flags from "test" intact for cmovle)
        mov     r8,QWORD[rsi]

        mov     ecx,DWORD[8+rsi]
        cmp     ecx,edx
        cmovg   edx,ecx
        test    ecx,ecx
        mov     DWORD[rbx],ecx
        cmovle  r8,rbp

        ; Lane 1.
        mov     r9,QWORD[16+rsi]

        mov     ecx,DWORD[24+rsi]
        cmp     ecx,edx
        cmovg   edx,ecx
        test    ecx,ecx
        mov     DWORD[4+rbx],ecx
        cmovle  r9,rbp

        ; Lane 2.
        mov     r10,QWORD[32+rsi]

        mov     ecx,DWORD[40+rsi]
        cmp     ecx,edx
        cmovg   edx,ecx
        test    ecx,ecx
        mov     DWORD[8+rbx],ecx
        cmovle  r10,rbp

        ; Lane 3.
        mov     r11,QWORD[48+rsi]

        mov     ecx,DWORD[56+rsi]
        cmp     ecx,edx
        cmovg   edx,ecx
        test    ecx,ecx
        mov     DWORD[12+rbx],ecx
        cmovle  r11,rbp
        ; No lane has a positive count: nothing to do, leave via $L$done
        ; (label defined beyond the end of this chunk).
        test    edx,edx
        jz      NEAR $L$done

        ; Load the five state rows; only the low 16 bytes of each 32-byte
        ; row are read on this path.
        movdqu  xmm10,XMMWORD[rdi]
        ; rax = rsp+128 so that all sixteen slots [rsp+0..rsp+255] are
        ; reachable with displacements in the range -128..+112.
        lea     rax,[128+rsp]
        movdqu  xmm11,XMMWORD[32+rdi]
        movdqu  xmm12,XMMWORD[64+rdi]
        movdqu  xmm13,XMMWORD[96+rdi]
        movdqu  xmm14,XMMWORD[128+rdi]
        movdqa  xmm5,XMMWORD[96+rbp]
        movdqa  xmm15,XMMWORD[((-32))+rbp]
        jmp     NEAR $L$oop

ALIGN   32
;-----------------------------------------------------------------------
; $L$oop -- start of the unrolled per-64-byte-block body.
;
; This stretch gathers message word 0 from the four input streams and
; performs the first round; every following round in the unrolled body
; repeats the same shape with the five working registers xmm10..xmm14
; rotated by one position.  Work for the next one or two message words
; is interleaved with the current round, so related instructions are not
; adjacent.
;
; Round performed here (per 32-bit lane, T = xmm14):
;   T += xmm15                      ; value loaded from [rbp-32]
;   T += W                          ; byte-shuffled message word (xmm0)
;   T += (xmm11 & xmm12) ^ (~xmm11 & xmm13)
;   T += rotl(xmm10, 5)
;   xmm11 = rotl(xmm11, 30)
; Scratch registers: xmm6, xmm7, xmm8, xmm9.
;-----------------------------------------------------------------------
$L$oop:
        ; Word 0 of each lane.  The pointers are advanced by 64 right
        ; away, so the remaining words of this block are addressed with
        ; negative displacements (-60 .. -4).
        movd    xmm0,DWORD[r8]
        lea     r8,[64+r8]
        movd    xmm2,DWORD[r9]
        lea     r9,[64+r9]
        movd    xmm3,DWORD[r10]
        lea     r10,[64+r10]
        movd    xmm4,DWORD[r11]
        lea     r11,[64+r11]
        punpckldq       xmm0,xmm3       ; xmm0 = { lane0, lane2, 0, 0 }
        movd    xmm1,DWORD[((-60))+r8]  ; begin gathering word 1
        punpckldq       xmm2,xmm4       ; xmm2 = { lane1, lane3, 0, 0 }
        movd    xmm9,DWORD[((-60))+r9]
        punpckldq       xmm0,xmm2       ; xmm0 = { lane0, lane1, lane2, lane3 }
        movd    xmm8,DWORD[((-60))+r10]
        ; 66 0F 38 00 C5 = pshufb xmm0,xmm5, emitted as raw bytes.
        ; NOTE(review): presumably a big-endian byte swap of each word --
        ; depends on the mask stored at K_XX_XX+96, which is not visible.
DB      102,15,56,0,197
        movd    xmm7,DWORD[((-60))+r11]
        punpckldq       xmm1,xmm8
        movdqa  xmm8,xmm10              ; xmm8 -> rotl(xmm10,5), left half
        paddd   xmm14,xmm15             ; T += constant
        punpckldq       xmm9,xmm7
        movdqa  xmm7,xmm11
        movdqa  xmm6,xmm11
        pslld   xmm8,5
        pandn   xmm7,xmm13              ; xmm7 = ~xmm11 & xmm13
        pand    xmm6,xmm12              ; xmm6 =  xmm11 & xmm12
        punpckldq       xmm1,xmm9       ; xmm1 = word 1 of all four lanes
        movdqa  xmm9,xmm10              ; xmm9 -> rotl(xmm10,5), right half

        ; Keep the shuffled word 0 in slot 0; it is read back from this
        ; slot later in the unrolled body.
        movdqa  XMMWORD[(0-128)+rax],xmm0
        paddd   xmm14,xmm0              ; T += W
        movd    xmm2,DWORD[((-56))+r8]  ; begin gathering word 2
        psrld   xmm9,27
        pxor    xmm6,xmm7               ; xmm6 = bitwise select function
        movdqa  xmm7,xmm11              ; xmm7 -> rotl(xmm11,30), left half

        por     xmm8,xmm9               ; xmm8 = rotl(xmm10,5)
        movd    xmm9,DWORD[((-56))+r9]
        pslld   xmm7,30
        paddd   xmm14,xmm6              ; T += select(...)

        psrld   xmm11,2
        paddd   xmm14,xmm8              ; T += rotl(xmm10,5)
        ; 66 0F 38 00 CD = pshufb xmm1,xmm5 (same shuffle, word 1).
DB      102,15,56,0,205
        movd    xmm8,DWORD[((-56))+r10]
        por     xmm11,xmm7              ; xmm11 = rotl(xmm11,30)
        movd    xmm7,DWORD[((-56))+r11]
        punpckldq       xmm2,xmm8
        movdqa  xmm8,xmm14
        paddd   xmm13,xmm15
        punpckldq       xmm9,xmm7
        movdqa  xmm7,xmm10
        movdqa  xmm6,xmm10
        pslld   xmm8,5
        pandn   xmm7,xmm12
        pand    xmm6,xmm11
        punpckldq       xmm2,xmm9
        movdqa  xmm9,xmm14

        movdqa  XMMWORD[(16-128)+rax],xmm1
        paddd   xmm13,xmm1
        movd    xmm3,DWORD[((-52))+r8]
        psrld   xmm9,27
        pxor    xmm6,xmm7
        movdqa  xmm7,xmm10

        por     xmm8,xmm9
        movd    xmm9,DWORD[((-52))+r9]
        pslld   xmm7,30
        paddd   xmm13,xmm6

        psrld   xmm10,2
        paddd   xmm13,xmm8
DB      102,15,56,0,213
        movd    xmm8,DWORD[((-52))+r10]
        por     xmm10,xmm7
        movd    xmm7,DWORD[((-52))+r11]
        punpckldq       xmm3,xmm8
        movdqa  xmm8,xmm13
        paddd   xmm12,xmm15
        punpckldq       xmm9,xmm7
        movdqa  xmm7,xmm14
        movdqa  xmm6,xmm14
        pslld   xmm8,5
        pandn   xmm7,xmm11
        pand    xmm6,xmm10
        punpckldq       xmm3,xmm9
        movdqa  xmm9,xmm13

        movdqa  XMMWORD[(32-128)+rax],xmm2
        paddd   xmm12,xmm2
        movd    xmm4,DWORD[((-48))+r8]
        psrld   xmm9,27
        pxor    xmm6,xmm7
        movdqa  xmm7,xmm14

        por     xmm8,xmm9
        movd    xmm9,DWORD[((-48))+r9]
        pslld   xmm7,30
        paddd   xmm12,xmm6

        psrld   xmm14,2
        paddd   xmm12,xmm8
DB      102,15,56,0,221
        movd    xmm8,DWORD[((-48))+r10]
        por     xmm14,xmm7
        movd    xmm7,DWORD[((-48))+r11]
        punpckldq       xmm4,xmm8
        movdqa  xmm8,xmm12
        paddd   xmm11,xmm15
        punpckldq       xmm9,xmm7
        movdqa  xmm7,xmm13
        movdqa  xmm6,xmm13
        pslld   xmm8,5
        pandn   xmm7,xmm10
        pand    xmm6,xmm14
        punpckldq       xmm4,xmm9
        movdqa  xmm9,xmm12

        movdqa  XMMWORD[(48-128)+rax],xmm3
        paddd   xmm11,xmm3
        movd    xmm0,DWORD[((-44))+r8]
        psrld   xmm9,27
        pxor    xmm6,xmm7
        movdqa  xmm7,xmm13

        por     xmm8,xmm9
        movd    xmm9,DWORD[((-44))+r9]
        pslld   xmm7,30
        paddd   xmm11,xmm6

        psrld   xmm13,2
        paddd   xmm11,xmm8
DB      102,15,56,0,229
        movd    xmm8,DWORD[((-44))+r10]
        por     xmm13,xmm7
        movd    xmm7,DWORD[((-44))+r11]
        punpckldq       xmm0,xmm8
        movdqa  xmm8,xmm11
        paddd   xmm10,xmm15
        punpckldq       xmm9,xmm7
        movdqa  xmm7,xmm12
        movdqa  xmm6,xmm12
        pslld   xmm8,5
        pandn   xmm7,xmm14
        pand    xmm6,xmm13
        punpckldq       xmm0,xmm9
        movdqa  xmm9,xmm11

        movdqa  XMMWORD[(64-128)+rax],xmm4
        paddd   xmm10,xmm4
        movd    xmm1,DWORD[((-40))+r8]
        psrld   xmm9,27
        pxor    xmm6,xmm7
        movdqa  xmm7,xmm12

        por     xmm8,xmm9
        movd    xmm9,DWORD[((-40))+r9]
        pslld   xmm7,30
        paddd   xmm10,xmm6

        psrld   xmm12,2
        paddd   xmm10,xmm8
DB      102,15,56,0,197
        movd    xmm8,DWORD[((-40))+r10]
        por     xmm12,xmm7
        movd    xmm7,DWORD[((-40))+r11]
        punpckldq       xmm1,xmm8
        movdqa  xmm8,xmm10
        paddd   xmm14,xmm15
        punpckldq       xmm9,xmm7
        movdqa  xmm7,xmm11
        movdqa  xmm6,xmm11
        pslld   xmm8,5
        pandn   xmm7,xmm13
        pand    xmm6,xmm12
        punpckldq       xmm1,xmm9
        movdqa  xmm9,xmm10

        movdqa  XMMWORD[(80-128)+rax],xmm0
        paddd   xmm14,xmm0
        movd    xmm2,DWORD[((-36))+r8]
        psrld   xmm9,27
        pxor    xmm6,xmm7
        movdqa  xmm7,xmm11

        por     xmm8,xmm9
        movd    xmm9,DWORD[((-36))+r9]
        pslld   xmm7,30
        paddd   xmm14,xmm6

        psrld   xmm11,2
        paddd   xmm14,xmm8
DB      102,15,56,0,205
        movd    xmm8,DWORD[((-36))+r10]
        por     xmm11,xmm7
        movd    xmm7,DWORD[((-36))+r11]
        punpckldq       xmm2,xmm8
        movdqa  xmm8,xmm14
        paddd   xmm13,xmm15
        punpckldq       xmm9,xmm7
        movdqa  xmm7,xmm10
        movdqa  xmm6,xmm10
        pslld   xmm8,5
        pandn   xmm7,xmm12
        pand    xmm6,xmm11
        punpckldq       xmm2,xmm9
        movdqa  xmm9,xmm14

        movdqa  XMMWORD[(96-128)+rax],xmm1
        paddd   xmm13,xmm1
        movd    xmm3,DWORD[((-32))+r8]
        psrld   xmm9,27
        pxor    xmm6,xmm7
        movdqa  xmm7,xmm10

        por     xmm8,xmm9
        movd    xmm9,DWORD[((-32))+r9]
        pslld   xmm7,30
        paddd   xmm13,xmm6

        psrld   xmm10,2
        paddd   xmm13,xmm8
DB      102,15,56,0,213
        movd    xmm8,DWORD[((-32))+r10]
        por     xmm10,xmm7
        movd    xmm7,DWORD[((-32))+r11]
        punpckldq       xmm3,xmm8
        movdqa  xmm8,xmm13
        paddd   xmm12,xmm15
        punpckldq       xmm9,xmm7
        movdqa  xmm7,xmm14
        movdqa  xmm6,xmm14
        pslld   xmm8,5
        pandn   xmm7,xmm11
        pand    xmm6,xmm10
        punpckldq       xmm3,xmm9
        movdqa  xmm9,xmm13

        movdqa  XMMWORD[(112-128)+rax],xmm2
        paddd   xmm12,xmm2
        movd    xmm4,DWORD[((-28))+r8]
        psrld   xmm9,27
        pxor    xmm6,xmm7
        movdqa  xmm7,xmm14

        por     xmm8,xmm9
        movd    xmm9,DWORD[((-28))+r9]
        pslld   xmm7,30
        paddd   xmm12,xmm6

        psrld   xmm14,2
        paddd   xmm12,xmm8
DB      102,15,56,0,221
        movd    xmm8,DWORD[((-28))+r10]
        por     xmm14,xmm7
        movd    xmm7,DWORD[((-28))+r11]
        punpckldq       xmm4,xmm8
        movdqa  xmm8,xmm12
        paddd   xmm11,xmm15
        punpckldq       xmm9,xmm7
        movdqa  xmm7,xmm13
        movdqa  xmm6,xmm13
        pslld   xmm8,5
        pandn   xmm7,xmm10
        pand    xmm6,xmm14
        punpckldq       xmm4,xmm9
        movdqa  xmm9,xmm12

        movdqa  XMMWORD[(128-128)+rax],xmm3
        paddd   xmm11,xmm3
        movd    xmm0,DWORD[((-24))+r8]
        psrld   xmm9,27
        pxor    xmm6,xmm7
        movdqa  xmm7,xmm13

        por     xmm8,xmm9
        movd    xmm9,DWORD[((-24))+r9]
        pslld   xmm7,30
        paddd   xmm11,xmm6

        psrld   xmm13,2
        paddd   xmm11,xmm8
DB      102,15,56,0,229
        movd    xmm8,DWORD[((-24))+r10]
        por     xmm13,xmm7
        movd    xmm7,DWORD[((-24))+r11]
        punpckldq       xmm0,xmm8
        movdqa  xmm8,xmm11
        paddd   xmm10,xmm15
        punpckldq       xmm9,xmm7
        movdqa  xmm7,xmm12
        movdqa  xmm6,xmm12
        pslld   xmm8,5
        pandn   xmm7,xmm14
        pand    xmm6,xmm13
        punpckldq       xmm0,xmm9
        movdqa  xmm9,xmm11

        movdqa  XMMWORD[(144-128)+rax],xmm4
        paddd   xmm10,xmm4
        movd    xmm1,DWORD[((-20))+r8]
        psrld   xmm9,27
        pxor    xmm6,xmm7
        movdqa  xmm7,xmm12

        por     xmm8,xmm9
        movd    xmm9,DWORD[((-20))+r9]
        pslld   xmm7,30
        paddd   xmm10,xmm6

        psrld   xmm12,2
        paddd   xmm10,xmm8
DB      102,15,56,0,197
        movd    xmm8,DWORD[((-20))+r10]
        por     xmm12,xmm7
        movd    xmm7,DWORD[((-20))+r11]
        punpckldq       xmm1,xmm8
        movdqa  xmm8,xmm10
        paddd   xmm14,xmm15
        punpckldq       xmm9,xmm7
        movdqa  xmm7,xmm11
        movdqa  xmm6,xmm11
        pslld   xmm8,5
        pandn   xmm7,xmm13
        pand    xmm6,xmm12
        punpckldq       xmm1,xmm9
        movdqa  xmm9,xmm10

        movdqa  XMMWORD[(160-128)+rax],xmm0
        paddd   xmm14,xmm0
        movd    xmm2,DWORD[((-16))+r8]
        psrld   xmm9,27
        pxor    xmm6,xmm7
        movdqa  xmm7,xmm11

        por     xmm8,xmm9
        movd    xmm9,DWORD[((-16))+r9]
        pslld   xmm7,30
        paddd   xmm14,xmm6

        psrld   xmm11,2
        paddd   xmm14,xmm8
DB      102,15,56,0,205
        movd    xmm8,DWORD[((-16))+r10]
        por     xmm11,xmm7
        movd    xmm7,DWORD[((-16))+r11]
        punpckldq       xmm2,xmm8
        movdqa  xmm8,xmm14
        paddd   xmm13,xmm15
        punpckldq       xmm9,xmm7
        movdqa  xmm7,xmm10
        movdqa  xmm6,xmm10
        pslld   xmm8,5
        pandn   xmm7,xmm12
        pand    xmm6,xmm11
        punpckldq       xmm2,xmm9
        movdqa  xmm9,xmm14

        movdqa  XMMWORD[(176-128)+rax],xmm1
        paddd   xmm13,xmm1
        movd    xmm3,DWORD[((-12))+r8]
        psrld   xmm9,27
        pxor    xmm6,xmm7
        movdqa  xmm7,xmm10

        por     xmm8,xmm9
        movd    xmm9,DWORD[((-12))+r9]
        pslld   xmm7,30
        paddd   xmm13,xmm6

        psrld   xmm10,2
        paddd   xmm13,xmm8
DB      102,15,56,0,213
        movd    xmm8,DWORD[((-12))+r10]
        por     xmm10,xmm7
        movd    xmm7,DWORD[((-12))+r11]
        punpckldq       xmm3,xmm8
        movdqa  xmm8,xmm13
        paddd   xmm12,xmm15
        punpckldq       xmm9,xmm7
        movdqa  xmm7,xmm14
        movdqa  xmm6,xmm14
        pslld   xmm8,5
        pandn   xmm7,xmm11
        pand    xmm6,xmm10
        punpckldq       xmm3,xmm9
        movdqa  xmm9,xmm13

        movdqa  XMMWORD[(192-128)+rax],xmm2
        paddd   xmm12,xmm2
        movd    xmm4,DWORD[((-8))+r8]
        psrld   xmm9,27
        pxor    xmm6,xmm7
        movdqa  xmm7,xmm14

        por     xmm8,xmm9
        movd    xmm9,DWORD[((-8))+r9]
        pslld   xmm7,30
        paddd   xmm12,xmm6

        psrld   xmm14,2
        paddd   xmm12,xmm8
DB      102,15,56,0,221
        movd    xmm8,DWORD[((-8))+r10]
        por     xmm14,xmm7
        movd    xmm7,DWORD[((-8))+r11]
        punpckldq       xmm4,xmm8
        movdqa  xmm8,xmm12
        paddd   xmm11,xmm15
        punpckldq       xmm9,xmm7
        movdqa  xmm7,xmm13
        movdqa  xmm6,xmm13
        pslld   xmm8,5
        pandn   xmm7,xmm10
        pand    xmm6,xmm14
        punpckldq       xmm4,xmm9
        movdqa  xmm9,xmm12

        movdqa  XMMWORD[(208-128)+rax],xmm3
        paddd   xmm11,xmm3
        movd    xmm0,DWORD[((-4))+r8]
        psrld   xmm9,27
        pxor    xmm6,xmm7
        movdqa  xmm7,xmm13

        por     xmm8,xmm9
        movd    xmm9,DWORD[((-4))+r9]
        pslld   xmm7,30
        paddd   xmm11,xmm6

        psrld   xmm13,2
        paddd   xmm11,xmm8
DB      102,15,56,0,229
        movd    xmm8,DWORD[((-4))+r10]
        por     xmm13,xmm7
        movdqa  xmm1,XMMWORD[((0-128))+rax]
        movd    xmm7,DWORD[((-4))+r11]
        punpckldq       xmm0,xmm8
        movdqa  xmm8,xmm11
        paddd   xmm10,xmm15
        punpckldq       xmm9,xmm7
        movdqa  xmm7,xmm12
        movdqa  xmm6,xmm12
        pslld   xmm8,5
        prefetcht0      [63+r8]
        pandn   xmm7,xmm14
        pand    xmm6,xmm13
        punpckldq       xmm0,xmm9
        movdqa  xmm9,xmm11

        movdqa  XMMWORD[(224-128)+rax],xmm4
        paddd   xmm10,xmm4
        psrld   xmm9,27
        pxor    xmm6,xmm7
        movdqa  xmm7,xmm12
        prefetcht0      [63+r9]

        por     xmm8,xmm9
        pslld   xmm7,30
        paddd   xmm10,xmm6
        prefetcht0      [63+r10]

        psrld   xmm12,2
        paddd   xmm10,xmm8
DB      102,15,56,0,197
        prefetcht0      [63+r11]
        por     xmm12,xmm7
        movdqa  xmm2,XMMWORD[((16-128))+rax]
        pxor    xmm1,xmm3
        movdqa  xmm3,XMMWORD[((32-128))+rax]

        movdqa  xmm8,xmm10
        pxor    xmm1,XMMWORD[((128-128))+rax]
        paddd   xmm14,xmm15
        movdqa  xmm7,xmm11
        pslld   xmm8,5
        pxor    xmm1,xmm3
        movdqa  xmm6,xmm11
        pandn   xmm7,xmm13
        movdqa  xmm5,xmm1
        pand    xmm6,xmm12
        movdqa  xmm9,xmm10
        psrld   xmm5,31
        paddd   xmm1,xmm1

        movdqa  XMMWORD[(240-128)+rax],xmm0
        paddd   xmm14,xmm0
        psrld   xmm9,27
        pxor    xmm6,xmm7

        movdqa  xmm7,xmm11
        por     xmm8,xmm9
        pslld   xmm7,30
        paddd   xmm14,xmm6

        psrld   xmm11,2
        paddd   xmm14,xmm8
        por     xmm1,xmm5
        por     xmm11,xmm7
        pxor    xmm2,xmm4
        movdqa  xmm4,XMMWORD[((48-128))+rax]

        movdqa  xmm8,xmm14
        pxor    xmm2,XMMWORD[((144-128))+rax]
        paddd   xmm13,xmm15
        movdqa  xmm7,xmm10
        pslld   xmm8,5
        pxor    xmm2,xmm4
        movdqa  xmm6,xmm10
        pandn   xmm7,xmm12
        movdqa  xmm5,xmm2
        pand    xmm6,xmm11
        movdqa  xmm9,xmm14
        psrld   xmm5,31
        paddd   xmm2,xmm2

        movdqa  XMMWORD[(0-128)+rax],xmm1
        paddd   xmm13,xmm1
        psrld   xmm9,27
        pxor    xmm6,xmm7

        movdqa  xmm7,xmm10
        por     xmm8,xmm9
        pslld   xmm7,30
        paddd   xmm13,xmm6

        psrld   xmm10,2
        paddd   xmm13,xmm8
        por     xmm2,xmm5
        por     xmm10,xmm7
        pxor    xmm3,xmm0
        movdqa  xmm0,XMMWORD[((64-128))+rax]

        movdqa  xmm8,xmm13
        pxor    xmm3,XMMWORD[((160-128))+rax]
        paddd   xmm12,xmm15
        movdqa  xmm7,xmm14
        pslld   xmm8,5
        pxor    xmm3,xmm0
        movdqa  xmm6,xmm14
        pandn   xmm7,xmm11
        movdqa  xmm5,xmm3
        pand    xmm6,xmm10
        movdqa  xmm9,xmm13
        psrld   xmm5,31
        paddd   xmm3,xmm3

        movdqa  XMMWORD[(16-128)+rax],xmm2
        paddd   xmm12,xmm2
        psrld   xmm9,27
        pxor    xmm6,xmm7

        movdqa  xmm7,xmm14
        por     xmm8,xmm9
        pslld   xmm7,30
        paddd   xmm12,xmm6

        psrld   xmm14,2
        paddd   xmm12,xmm8
        por     xmm3,xmm5
        por     xmm14,xmm7
        pxor    xmm4,xmm1
        movdqa  xmm1,XMMWORD[((80-128))+rax]

        movdqa  xmm8,xmm12
        pxor    xmm4,XMMWORD[((176-128))+rax]
        paddd   xmm11,xmm15
        movdqa  xmm7,xmm13
        pslld   xmm8,5
        pxor    xmm4,xmm1
        movdqa  xmm6,xmm13
        pandn   xmm7,xmm10
        movdqa  xmm5,xmm4
        pand    xmm6,xmm14
        movdqa  xmm9,xmm12
        psrld   xmm5,31
        paddd   xmm4,xmm4

        movdqa  XMMWORD[(32-128)+rax],xmm3
        paddd   xmm11,xmm3
        psrld   xmm9,27
        pxor    xmm6,xmm7

        movdqa  xmm7,xmm13
        por     xmm8,xmm9
        pslld   xmm7,30
        paddd   xmm11,xmm6

        psrld   xmm13,2
        paddd   xmm11,xmm8
        por     xmm4,xmm5
        por     xmm13,xmm7
        pxor    xmm0,xmm2
        movdqa  xmm2,XMMWORD[((96-128))+rax]

        movdqa  xmm8,xmm11
        pxor    xmm0,XMMWORD[((192-128))+rax]
        paddd   xmm10,xmm15
        movdqa  xmm7,xmm12
        pslld   xmm8,5
        pxor    xmm0,xmm2
        movdqa  xmm6,xmm12
        pandn   xmm7,xmm14
        movdqa  xmm5,xmm0
        pand    xmm6,xmm13
        movdqa  xmm9,xmm11
        psrld   xmm5,31
        paddd   xmm0,xmm0

        movdqa  XMMWORD[(48-128)+rax],xmm4
        paddd   xmm10,xmm4
        psrld   xmm9,27
        pxor    xmm6,xmm7

        movdqa  xmm7,xmm12
        por     xmm8,xmm9
        pslld   xmm7,30
        paddd   xmm10,xmm6

        psrld   xmm12,2
        paddd   xmm10,xmm8
        por     xmm0,xmm5
        por     xmm12,xmm7
        movdqa  xmm15,XMMWORD[rbp]
        pxor    xmm1,xmm3
        movdqa  xmm3,XMMWORD[((112-128))+rax]

        movdqa  xmm8,xmm10
        movdqa  xmm6,xmm13
        pxor    xmm1,XMMWORD[((208-128))+rax]
        paddd   xmm14,xmm15
        pslld   xmm8,5
        pxor    xmm6,xmm11

        movdqa  xmm9,xmm10
        movdqa  XMMWORD[(64-128)+rax],xmm0
        paddd   xmm14,xmm0
        pxor    xmm1,xmm3
        psrld   xmm9,27
        pxor    xmm6,xmm12
        movdqa  xmm7,xmm11

        pslld   xmm7,30
        movdqa  xmm5,xmm1
        por     xmm8,xmm9
        psrld   xmm5,31
        paddd   xmm14,xmm6
        paddd   xmm1,xmm1

        psrld   xmm11,2
        paddd   xmm14,xmm8
        por     xmm1,xmm5
        por     xmm11,xmm7
        pxor    xmm2,xmm4
        movdqa  xmm4,XMMWORD[((128-128))+rax]

        movdqa  xmm8,xmm14
        movdqa  xmm6,xmm12
        pxor    xmm2,XMMWORD[((224-128))+rax]
        paddd   xmm13,xmm15
        pslld   xmm8,5
        pxor    xmm6,xmm10

        movdqa  xmm9,xmm14
        movdqa  XMMWORD[(80-128)+rax],xmm1
        paddd   xmm13,xmm1
        pxor    xmm2,xmm4
        psrld   xmm9,27
        pxor    xmm6,xmm11
        movdqa  xmm7,xmm10

        pslld   xmm7,30
        movdqa  xmm5,xmm2
        por     xmm8,xmm9
        psrld   xmm5,31
        paddd   xmm13,xmm6
        paddd   xmm2,xmm2

        psrld   xmm10,2
        paddd   xmm13,xmm8
        por     xmm2,xmm5
        por     xmm10,xmm7
        pxor    xmm3,xmm0
        movdqa  xmm0,XMMWORD[((144-128))+rax]

        movdqa  xmm8,xmm13
        movdqa  xmm6,xmm11
        pxor    xmm3,XMMWORD[((240-128))+rax]
        paddd   xmm12,xmm15
        pslld   xmm8,5
        pxor    xmm6,xmm14

        movdqa  xmm9,xmm13
        movdqa  XMMWORD[(96-128)+rax],xmm2
        paddd   xmm12,xmm2
        pxor    xmm3,xmm0
        psrld   xmm9,27
        pxor    xmm6,xmm10
        movdqa  xmm7,xmm14

        pslld   xmm7,30
        movdqa  xmm5,xmm3
        por     xmm8,xmm9
        psrld   xmm5,31
        paddd   xmm12,xmm6
        paddd   xmm3,xmm3

        psrld   xmm14,2
        paddd   xmm12,xmm8
        por     xmm3,xmm5
        por     xmm14,xmm7
        pxor    xmm4,xmm1
        movdqa  xmm1,XMMWORD[((160-128))+rax]

        movdqa  xmm8,xmm12
        movdqa  xmm6,xmm10
        pxor    xmm4,XMMWORD[((0-128))+rax]
        paddd   xmm11,xmm15
        pslld   xmm8,5
        pxor    xmm6,xmm13

        movdqa  xmm9,xmm12
        movdqa  XMMWORD[(112-128)+rax],xmm3
        paddd   xmm11,xmm3
        pxor    xmm4,xmm1
        psrld   xmm9,27
        pxor    xmm6,xmm14
        movdqa  xmm7,xmm13

        pslld   xmm7,30
        movdqa  xmm5,xmm4
        por     xmm8,xmm9
        psrld   xmm5,31
        paddd   xmm11,xmm6
        paddd   xmm4,xmm4

        psrld   xmm13,2
        paddd   xmm11,xmm8
        por     xmm4,xmm5
        por     xmm13,xmm7
        pxor    xmm0,xmm2
        movdqa  xmm2,XMMWORD[((176-128))+rax]

        movdqa  xmm8,xmm11
        movdqa  xmm6,xmm14
        pxor    xmm0,XMMWORD[((16-128))+rax]
        paddd   xmm10,xmm15
        pslld   xmm8,5
        pxor    xmm6,xmm12

        movdqa  xmm9,xmm11
        movdqa  XMMWORD[(128-128)+rax],xmm4
        paddd   xmm10,xmm4
        pxor    xmm0,xmm2
        psrld   xmm9,27
        pxor    xmm6,xmm13
        movdqa  xmm7,xmm12

        pslld   xmm7,30
        movdqa  xmm5,xmm0
        por     xmm8,xmm9
        psrld   xmm5,31
        paddd   xmm10,xmm6
        paddd   xmm0,xmm0

        psrld   xmm12,2
        paddd   xmm10,xmm8
        por     xmm0,xmm5
        por     xmm12,xmm7
        pxor    xmm1,xmm3
        movdqa  xmm3,XMMWORD[((192-128))+rax]

        movdqa  xmm8,xmm10
        movdqa  xmm6,xmm13
        pxor    xmm1,XMMWORD[((32-128))+rax]
        paddd   xmm14,xmm15
        pslld   xmm8,5
        pxor    xmm6,xmm11

        movdqa  xmm9,xmm10
        movdqa  XMMWORD[(144-128)+rax],xmm0
        paddd   xmm14,xmm0
        pxor    xmm1,xmm3
        psrld   xmm9,27
        pxor    xmm6,xmm12
        movdqa  xmm7,xmm11

        pslld   xmm7,30
        movdqa  xmm5,xmm1
        por     xmm8,xmm9
        psrld   xmm5,31
        paddd   xmm14,xmm6
        paddd   xmm1,xmm1

        psrld   xmm11,2
        paddd   xmm14,xmm8
        por     xmm1,xmm5
        por     xmm11,xmm7
        pxor    xmm2,xmm4
        movdqa  xmm4,XMMWORD[((208-128))+rax]

        movdqa  xmm8,xmm14
        movdqa  xmm6,xmm12
        pxor    xmm2,XMMWORD[((48-128))+rax]
        paddd   xmm13,xmm15
        pslld   xmm8,5
        pxor    xmm6,xmm10

        movdqa  xmm9,xmm14
        movdqa  XMMWORD[(160-128)+rax],xmm1
        paddd   xmm13,xmm1
        pxor    xmm2,xmm4
        psrld   xmm9,27
        pxor    xmm6,xmm11
        movdqa  xmm7,xmm10

        pslld   xmm7,30
        movdqa  xmm5,xmm2
        por     xmm8,xmm9
        psrld   xmm5,31
        paddd   xmm13,xmm6
        paddd   xmm2,xmm2

        psrld   xmm10,2
        paddd   xmm13,xmm8
        por     xmm2,xmm5
        por     xmm10,xmm7
        pxor    xmm3,xmm0
        movdqa  xmm0,XMMWORD[((224-128))+rax]

        movdqa  xmm8,xmm13
        movdqa  xmm6,xmm11
        pxor    xmm3,XMMWORD[((64-128))+rax]
        paddd   xmm12,xmm15
        pslld   xmm8,5
        pxor    xmm6,xmm14

        movdqa  xmm9,xmm13
        movdqa  XMMWORD[(176-128)+rax],xmm2
        paddd   xmm12,xmm2
        pxor    xmm3,xmm0
        psrld   xmm9,27
        pxor    xmm6,xmm10
        movdqa  xmm7,xmm14

        pslld   xmm7,30
        movdqa  xmm5,xmm3
        por     xmm8,xmm9
        psrld   xmm5,31
        paddd   xmm12,xmm6
        paddd   xmm3,xmm3

        psrld   xmm14,2
        paddd   xmm12,xmm8
        por     xmm3,xmm5
        por     xmm14,xmm7
        pxor    xmm4,xmm1
        movdqa  xmm1,XMMWORD[((240-128))+rax]

        movdqa  xmm8,xmm12
        movdqa  xmm6,xmm10
        pxor    xmm4,XMMWORD[((80-128))+rax]
        paddd   xmm11,xmm15
        pslld   xmm8,5
        pxor    xmm6,xmm13

        movdqa  xmm9,xmm12
        movdqa  XMMWORD[(192-128)+rax],xmm3
        paddd   xmm11,xmm3
        pxor    xmm4,xmm1
        psrld   xmm9,27
        pxor    xmm6,xmm14
        movdqa  xmm7,xmm13

        pslld   xmm7,30
        movdqa  xmm5,xmm4
        por     xmm8,xmm9
        psrld   xmm5,31
        paddd   xmm11,xmm6
        paddd   xmm4,xmm4

        psrld   xmm13,2
        paddd   xmm11,xmm8
        por     xmm4,xmm5
        por     xmm13,xmm7
        pxor    xmm0,xmm2
        movdqa  xmm2,XMMWORD[((0-128))+rax]

        movdqa  xmm8,xmm11
        movdqa  xmm6,xmm14
        pxor    xmm0,XMMWORD[((96-128))+rax]
        paddd   xmm10,xmm15
        pslld   xmm8,5
        pxor    xmm6,xmm12

        movdqa  xmm9,xmm11
        movdqa  XMMWORD[(208-128)+rax],xmm4
        paddd   xmm10,xmm4
        pxor    xmm0,xmm2
        psrld   xmm9,27
        pxor    xmm6,xmm13
        movdqa  xmm7,xmm12

        pslld   xmm7,30
        movdqa  xmm5,xmm0
        por     xmm8,xmm9
        psrld   xmm5,31
        paddd   xmm10,xmm6
        paddd   xmm0,xmm0

        psrld   xmm12,2
        paddd   xmm10,xmm8
        por     xmm0,xmm5
        por     xmm12,xmm7
        pxor    xmm1,xmm3
        movdqa  xmm3,XMMWORD[((16-128))+rax]

        movdqa  xmm8,xmm10
        movdqa  xmm6,xmm13
        pxor    xmm1,XMMWORD[((112-128))+rax]
        paddd   xmm14,xmm15
        pslld   xmm8,5
        pxor    xmm6,xmm11

        movdqa  xmm9,xmm10
        movdqa  XMMWORD[(224-128)+rax],xmm0
        paddd   xmm14,xmm0
        pxor    xmm1,xmm3
        psrld   xmm9,27
        pxor    xmm6,xmm12
        movdqa  xmm7,xmm11

        pslld   xmm7,30
        movdqa  xmm5,xmm1
        por     xmm8,xmm9
        psrld   xmm5,31
        paddd   xmm14,xmm6
        paddd   xmm1,xmm1

        psrld   xmm11,2
        paddd   xmm14,xmm8
        por     xmm1,xmm5
        por     xmm11,xmm7
        pxor    xmm2,xmm4
        movdqa  xmm4,XMMWORD[((32-128))+rax]

        movdqa  xmm8,xmm14
        movdqa  xmm6,xmm12
        pxor    xmm2,XMMWORD[((128-128))+rax]
        paddd   xmm13,xmm15
        pslld   xmm8,5
        pxor    xmm6,xmm10

        movdqa  xmm9,xmm14
        movdqa  XMMWORD[(240-128)+rax],xmm1
        paddd   xmm13,xmm1
        pxor    xmm2,xmm4
        psrld   xmm9,27
        pxor    xmm6,xmm11
        movdqa  xmm7,xmm10

        pslld   xmm7,30
        movdqa  xmm5,xmm2
        por     xmm8,xmm9
        psrld   xmm5,31
        paddd   xmm13,xmm6
        paddd   xmm2,xmm2

        psrld   xmm10,2
        paddd   xmm13,xmm8
        por     xmm2,xmm5
        por     xmm10,xmm7
        pxor    xmm3,xmm0
        movdqa  xmm0,XMMWORD[((48-128))+rax]

        movdqa  xmm8,xmm13
        movdqa  xmm6,xmm11
        pxor    xmm3,XMMWORD[((144-128))+rax]
        paddd   xmm12,xmm15
        pslld   xmm8,5
        pxor    xmm6,xmm14

        movdqa  xmm9,xmm13
        movdqa  XMMWORD[(0-128)+rax],xmm2
        paddd   xmm12,xmm2
        pxor    xmm3,xmm0
        psrld   xmm9,27
        pxor    xmm6,xmm10
        movdqa  xmm7,xmm14

        pslld   xmm7,30
        movdqa  xmm5,xmm3
        por     xmm8,xmm9
        psrld   xmm5,31
        paddd   xmm12,xmm6
        paddd   xmm3,xmm3

        psrld   xmm14,2
        paddd   xmm12,xmm8
        por     xmm3,xmm5
        por     xmm14,xmm7
        pxor    xmm4,xmm1
        movdqa  xmm1,XMMWORD[((64-128))+rax]

        movdqa  xmm8,xmm12
        movdqa  xmm6,xmm10
        pxor    xmm4,XMMWORD[((160-128))+rax]
        paddd   xmm11,xmm15
        pslld   xmm8,5
        pxor    xmm6,xmm13

        movdqa  xmm9,xmm12
        movdqa  XMMWORD[(16-128)+rax],xmm3
        paddd   xmm11,xmm3
        pxor    xmm4,xmm1
        psrld   xmm9,27
        pxor    xmm6,xmm14
        movdqa  xmm7,xmm13

        pslld   xmm7,30
        movdqa  xmm5,xmm4
        por     xmm8,xmm9
        psrld   xmm5,31
        paddd   xmm11,xmm6
        paddd   xmm4,xmm4

        psrld   xmm13,2
        paddd   xmm11,xmm8
        por     xmm4,xmm5
        por     xmm13,xmm7
        pxor    xmm0,xmm2
        movdqa  xmm2,XMMWORD[((80-128))+rax]

        movdqa  xmm8,xmm11
        movdqa  xmm6,xmm14
        pxor    xmm0,XMMWORD[((176-128))+rax]
        paddd   xmm10,xmm15
        pslld   xmm8,5
        pxor    xmm6,xmm12

        movdqa  xmm9,xmm11
        movdqa  XMMWORD[(32-128)+rax],xmm4
        paddd   xmm10,xmm4
        pxor    xmm0,xmm2
        psrld   xmm9,27
        pxor    xmm6,xmm13
        movdqa  xmm7,xmm12

        pslld   xmm7,30
        movdqa  xmm5,xmm0
        por     xmm8,xmm9
        psrld   xmm5,31
        paddd   xmm10,xmm6
        paddd   xmm0,xmm0

        psrld   xmm12,2
        paddd   xmm10,xmm8
        por     xmm0,xmm5
        por     xmm12,xmm7
        pxor    xmm1,xmm3
        movdqa  xmm3,XMMWORD[((96-128))+rax]

        movdqa  xmm8,xmm10
        movdqa  xmm6,xmm13
        pxor    xmm1,XMMWORD[((192-128))+rax]
        paddd   xmm14,xmm15
        pslld   xmm8,5
        pxor    xmm6,xmm11

        movdqa  xmm9,xmm10
        movdqa  XMMWORD[(48-128)+rax],xmm0
        paddd   xmm14,xmm0
        pxor    xmm1,xmm3
        psrld   xmm9,27
        pxor    xmm6,xmm12
        movdqa  xmm7,xmm11

        pslld   xmm7,30
        movdqa  xmm5,xmm1
        por     xmm8,xmm9
        psrld   xmm5,31
        paddd   xmm14,xmm6
        paddd   xmm1,xmm1

        psrld   xmm11,2
        paddd   xmm14,xmm8
        por     xmm1,xmm5
        por     xmm11,xmm7
        pxor    xmm2,xmm4
        movdqa  xmm4,XMMWORD[((112-128))+rax]

        movdqa  xmm8,xmm14
        movdqa  xmm6,xmm12
        pxor    xmm2,XMMWORD[((208-128))+rax]
        paddd   xmm13,xmm15
        pslld   xmm8,5
        pxor    xmm6,xmm10

        movdqa  xmm9,xmm14
        movdqa  XMMWORD[(64-128)+rax],xmm1
        paddd   xmm13,xmm1
        pxor    xmm2,xmm4
        psrld   xmm9,27
        pxor    xmm6,xmm11
        movdqa  xmm7,xmm10

        pslld   xmm7,30
        movdqa  xmm5,xmm2
        por     xmm8,xmm9
        psrld   xmm5,31
        paddd   xmm13,xmm6
        paddd   xmm2,xmm2

        psrld   xmm10,2
        paddd   xmm13,xmm8
        por     xmm2,xmm5
        por     xmm10,xmm7
        pxor    xmm3,xmm0
        movdqa  xmm0,XMMWORD[((128-128))+rax]

        movdqa  xmm8,xmm13
        movdqa  xmm6,xmm11
        pxor    xmm3,XMMWORD[((224-128))+rax]
        paddd   xmm12,xmm15
        pslld   xmm8,5
        pxor    xmm6,xmm14

        movdqa  xmm9,xmm13
        movdqa  XMMWORD[(80-128)+rax],xmm2
        paddd   xmm12,xmm2
        pxor    xmm3,xmm0
        psrld   xmm9,27
        pxor    xmm6,xmm10
        movdqa  xmm7,xmm14

        pslld   xmm7,30
        movdqa  xmm5,xmm3
        por     xmm8,xmm9
        psrld   xmm5,31
        paddd   xmm12,xmm6
        paddd   xmm3,xmm3

        psrld   xmm14,2
        paddd   xmm12,xmm8
        por     xmm3,xmm5
        por     xmm14,xmm7
        pxor    xmm4,xmm1
        movdqa  xmm1,XMMWORD[((144-128))+rax]

        movdqa  xmm8,xmm12
        movdqa  xmm6,xmm10
        pxor    xmm4,XMMWORD[((240-128))+rax]
        paddd   xmm11,xmm15
        pslld   xmm8,5
        pxor    xmm6,xmm13

        movdqa  xmm9,xmm12
        movdqa  XMMWORD[(96-128)+rax],xmm3
        paddd   xmm11,xmm3
        pxor    xmm4,xmm1
        psrld   xmm9,27
        pxor    xmm6,xmm14
        movdqa  xmm7,xmm13

        pslld   xmm7,30
        movdqa  xmm5,xmm4
        por     xmm8,xmm9
        psrld   xmm5,31
        paddd   xmm11,xmm6
        paddd   xmm4,xmm4

        psrld   xmm13,2
        paddd   xmm11,xmm8
        por     xmm4,xmm5
        por     xmm13,xmm7
        pxor    xmm0,xmm2
        movdqa  xmm2,XMMWORD[((160-128))+rax]

        movdqa  xmm8,xmm11
        movdqa  xmm6,xmm14
        pxor    xmm0,XMMWORD[((0-128))+rax]
        paddd   xmm10,xmm15
        pslld   xmm8,5
        pxor    xmm6,xmm12

        movdqa  xmm9,xmm11
        movdqa  XMMWORD[(112-128)+rax],xmm4
        paddd   xmm10,xmm4
        pxor    xmm0,xmm2
        psrld   xmm9,27
        pxor    xmm6,xmm13
        movdqa  xmm7,xmm12

        pslld   xmm7,30
        movdqa  xmm5,xmm0
        por     xmm8,xmm9
        psrld   xmm5,31
        paddd   xmm10,xmm6
        paddd   xmm0,xmm0

        psrld   xmm12,2
        paddd   xmm10,xmm8
        por     xmm0,xmm5
        por     xmm12,xmm7
        ; One unrolled round using the AND/XOR/AND boolean form. All vector ops
        ; are packed-dword, so each 32-bit lane of an xmm register is processed
        ; independently.
        ; Register roles in this round, as read from the instructions below:
        ;   xmm10 - rotated left by 5 and added into the accumulator
        ;   xmm11 - ANDed into the boolean term, then rotated left by 30 in place
        ;   xmm12, xmm13 - the other two boolean-function inputs
        ;   xmm14 - accumulator that receives every term of the round
        ;   xmm0  - message word consumed by this round
        ;   xmm1  - message word being produced for a later round
        movdqa  xmm15,XMMWORD[32+rbp]   ; switch to the constant vector at rbp+32 (rbp = K_XX_XX)
        pxor    xmm1,xmm3               ; schedule: xmm1 ^= previous contents of xmm3
        movdqa  xmm3,XMMWORD[((176-128))+rax]   ; refill xmm3 from the 256-byte word ring addressed via rax

        movdqa  xmm8,xmm10
        movdqa  xmm7,xmm13
        pxor    xmm1,XMMWORD[((16-128))+rax]    ; schedule: xor in a second ring entry
        pxor    xmm1,xmm3               ; schedule: xor in the freshly loaded entry
        paddd   xmm14,xmm15             ; accumulator += constant
        pslld   xmm8,5                  ; high part of rol(xmm10,5)
        movdqa  xmm9,xmm10
        pand    xmm7,xmm12              ; term 1: xmm13 & xmm12

        movdqa  xmm6,xmm13
        movdqa  xmm5,xmm1
        psrld   xmm9,27                 ; low part of rol(xmm10,5)
        paddd   xmm14,xmm7              ; accumulator += term 1
        pxor    xmm6,xmm12

        movdqa  XMMWORD[(128-128)+rax],xmm0     ; write the current word back into the ring
        paddd   xmm14,xmm0              ; accumulator += current message word
        por     xmm8,xmm9               ; xmm8 = rol(xmm10,5)
        psrld   xmm5,31                 ; bit that wraps around in rol(xmm1,1)
        pand    xmm6,xmm11              ; term 2: (xmm13 ^ xmm12) & xmm11
        movdqa  xmm7,xmm11

        pslld   xmm7,30                 ; high part of rol(xmm11,30)
        paddd   xmm1,xmm1               ; xmm1 << 1 done as an add
        paddd   xmm14,xmm6              ; terms 1 and 2 never share a set bit, so adding both equals OR-ing them

        psrld   xmm11,2                 ; low part of rol(xmm11,30)
        paddd   xmm14,xmm8              ; accumulator += rol(xmm10,5)
        por     xmm1,xmm5               ; xmm1 = rol(xmm1,1): next schedule word ready
        por     xmm11,xmm7              ; xmm11 = rol(xmm11,30)
        pxor    xmm2,xmm4
        movdqa  xmm4,XMMWORD[((192-128))+rax]

        movdqa  xmm8,xmm14
        movdqa  xmm7,xmm12
        pxor    xmm2,XMMWORD[((32-128))+rax]
        pxor    xmm2,xmm4
        paddd   xmm13,xmm15
        pslld   xmm8,5
        movdqa  xmm9,xmm14
        pand    xmm7,xmm11

        movdqa  xmm6,xmm12
        movdqa  xmm5,xmm2
        psrld   xmm9,27
        paddd   xmm13,xmm7
        pxor    xmm6,xmm11

        movdqa  XMMWORD[(144-128)+rax],xmm1
        paddd   xmm13,xmm1
        por     xmm8,xmm9
        psrld   xmm5,31
        pand    xmm6,xmm10
        movdqa  xmm7,xmm10

        pslld   xmm7,30
        paddd   xmm2,xmm2
        paddd   xmm13,xmm6

        psrld   xmm10,2
        paddd   xmm13,xmm8
        por     xmm2,xmm5
        por     xmm10,xmm7
        pxor    xmm3,xmm0
        movdqa  xmm0,XMMWORD[((208-128))+rax]

        movdqa  xmm8,xmm13
        movdqa  xmm7,xmm11
        pxor    xmm3,XMMWORD[((48-128))+rax]
        pxor    xmm3,xmm0
        paddd   xmm12,xmm15
        pslld   xmm8,5
        movdqa  xmm9,xmm13
        pand    xmm7,xmm10

        movdqa  xmm6,xmm11
        movdqa  xmm5,xmm3
        psrld   xmm9,27
        paddd   xmm12,xmm7
        pxor    xmm6,xmm10

        movdqa  XMMWORD[(160-128)+rax],xmm2
        paddd   xmm12,xmm2
        por     xmm8,xmm9
        psrld   xmm5,31
        pand    xmm6,xmm14
        movdqa  xmm7,xmm14

        pslld   xmm7,30
        paddd   xmm3,xmm3
        paddd   xmm12,xmm6

        psrld   xmm14,2
        paddd   xmm12,xmm8
        por     xmm3,xmm5
        por     xmm14,xmm7
        pxor    xmm4,xmm1
        movdqa  xmm1,XMMWORD[((224-128))+rax]

        movdqa  xmm8,xmm12
        movdqa  xmm7,xmm10
        pxor    xmm4,XMMWORD[((64-128))+rax]
        pxor    xmm4,xmm1
        paddd   xmm11,xmm15
        pslld   xmm8,5
        movdqa  xmm9,xmm12
        pand    xmm7,xmm14

        movdqa  xmm6,xmm10
        movdqa  xmm5,xmm4
        psrld   xmm9,27
        paddd   xmm11,xmm7
        pxor    xmm6,xmm14

        movdqa  XMMWORD[(176-128)+rax],xmm3
        paddd   xmm11,xmm3
        por     xmm8,xmm9
        psrld   xmm5,31
        pand    xmm6,xmm13
        movdqa  xmm7,xmm13

        pslld   xmm7,30
        paddd   xmm4,xmm4
        paddd   xmm11,xmm6

        psrld   xmm13,2
        paddd   xmm11,xmm8
        por     xmm4,xmm5
        por     xmm13,xmm7
        pxor    xmm0,xmm2
        movdqa  xmm2,XMMWORD[((240-128))+rax]

        movdqa  xmm8,xmm11
        movdqa  xmm7,xmm14
        pxor    xmm0,XMMWORD[((80-128))+rax]
        pxor    xmm0,xmm2
        paddd   xmm10,xmm15
        pslld   xmm8,5
        movdqa  xmm9,xmm11
        pand    xmm7,xmm13

        movdqa  xmm6,xmm14
        movdqa  xmm5,xmm0
        psrld   xmm9,27
        paddd   xmm10,xmm7
        pxor    xmm6,xmm13

        movdqa  XMMWORD[(192-128)+rax],xmm4
        paddd   xmm10,xmm4
        por     xmm8,xmm9
        psrld   xmm5,31
        pand    xmm6,xmm12
        movdqa  xmm7,xmm12

        pslld   xmm7,30
        paddd   xmm0,xmm0
        paddd   xmm10,xmm6

        psrld   xmm12,2
        paddd   xmm10,xmm8
        por     xmm0,xmm5
        por     xmm12,xmm7
        pxor    xmm1,xmm3
        movdqa  xmm3,XMMWORD[((0-128))+rax]

        movdqa  xmm8,xmm10
        movdqa  xmm7,xmm13
        pxor    xmm1,XMMWORD[((96-128))+rax]
        pxor    xmm1,xmm3
        paddd   xmm14,xmm15
        pslld   xmm8,5
        movdqa  xmm9,xmm10
        pand    xmm7,xmm12

        movdqa  xmm6,xmm13
        movdqa  xmm5,xmm1
        psrld   xmm9,27
        paddd   xmm14,xmm7
        pxor    xmm6,xmm12

        movdqa  XMMWORD[(208-128)+rax],xmm0
        paddd   xmm14,xmm0
        por     xmm8,xmm9
        psrld   xmm5,31
        pand    xmm6,xmm11
        movdqa  xmm7,xmm11

        pslld   xmm7,30
        paddd   xmm1,xmm1
        paddd   xmm14,xmm6

        psrld   xmm11,2
        paddd   xmm14,xmm8
        por     xmm1,xmm5
        por     xmm11,xmm7
        pxor    xmm2,xmm4
        movdqa  xmm4,XMMWORD[((16-128))+rax]

        movdqa  xmm8,xmm14
        movdqa  xmm7,xmm12
        pxor    xmm2,XMMWORD[((112-128))+rax]
        pxor    xmm2,xmm4
        paddd   xmm13,xmm15
        pslld   xmm8,5
        movdqa  xmm9,xmm14
        pand    xmm7,xmm11

        movdqa  xmm6,xmm12
        movdqa  xmm5,xmm2
        psrld   xmm9,27
        paddd   xmm13,xmm7
        pxor    xmm6,xmm11

        movdqa  XMMWORD[(224-128)+rax],xmm1
        paddd   xmm13,xmm1
        por     xmm8,xmm9
        psrld   xmm5,31
        pand    xmm6,xmm10
        movdqa  xmm7,xmm10

        pslld   xmm7,30
        paddd   xmm2,xmm2
        paddd   xmm13,xmm6

        psrld   xmm10,2
        paddd   xmm13,xmm8
        por     xmm2,xmm5
        por     xmm10,xmm7
        pxor    xmm3,xmm0
        movdqa  xmm0,XMMWORD[((32-128))+rax]

        movdqa  xmm8,xmm13
        movdqa  xmm7,xmm11
        pxor    xmm3,XMMWORD[((128-128))+rax]
        pxor    xmm3,xmm0
        paddd   xmm12,xmm15
        pslld   xmm8,5
        movdqa  xmm9,xmm13
        pand    xmm7,xmm10

        movdqa  xmm6,xmm11
        movdqa  xmm5,xmm3
        psrld   xmm9,27
        paddd   xmm12,xmm7
        pxor    xmm6,xmm10

        movdqa  XMMWORD[(240-128)+rax],xmm2
        paddd   xmm12,xmm2
        por     xmm8,xmm9
        psrld   xmm5,31
        pand    xmm6,xmm14
        movdqa  xmm7,xmm14

        pslld   xmm7,30
        paddd   xmm3,xmm3
        paddd   xmm12,xmm6

        psrld   xmm14,2
        paddd   xmm12,xmm8
        por     xmm3,xmm5
        por     xmm14,xmm7
        pxor    xmm4,xmm1
        movdqa  xmm1,XMMWORD[((48-128))+rax]

        movdqa  xmm8,xmm12
        movdqa  xmm7,xmm10
        pxor    xmm4,XMMWORD[((144-128))+rax]
        pxor    xmm4,xmm1
        paddd   xmm11,xmm15
        pslld   xmm8,5
        movdqa  xmm9,xmm12
        pand    xmm7,xmm14

        movdqa  xmm6,xmm10
        movdqa  xmm5,xmm4
        psrld   xmm9,27
        paddd   xmm11,xmm7
        pxor    xmm6,xmm14

        movdqa  XMMWORD[(0-128)+rax],xmm3
        paddd   xmm11,xmm3
        por     xmm8,xmm9
        psrld   xmm5,31
        pand    xmm6,xmm13
        movdqa  xmm7,xmm13

        pslld   xmm7,30
        paddd   xmm4,xmm4
        paddd   xmm11,xmm6

        psrld   xmm13,2
        paddd   xmm11,xmm8
        por     xmm4,xmm5
        por     xmm13,xmm7
        pxor    xmm0,xmm2
        movdqa  xmm2,XMMWORD[((64-128))+rax]

        movdqa  xmm8,xmm11
        movdqa  xmm7,xmm14
        pxor    xmm0,XMMWORD[((160-128))+rax]
        pxor    xmm0,xmm2
        paddd   xmm10,xmm15
        pslld   xmm8,5
        movdqa  xmm9,xmm11
        pand    xmm7,xmm13

        movdqa  xmm6,xmm14
        movdqa  xmm5,xmm0
        psrld   xmm9,27
        paddd   xmm10,xmm7
        pxor    xmm6,xmm13

        movdqa  XMMWORD[(16-128)+rax],xmm4
        paddd   xmm10,xmm4
        por     xmm8,xmm9
        psrld   xmm5,31
        pand    xmm6,xmm12
        movdqa  xmm7,xmm12

        pslld   xmm7,30
        paddd   xmm0,xmm0
        paddd   xmm10,xmm6

        psrld   xmm12,2
        paddd   xmm10,xmm8
        por     xmm0,xmm5
        por     xmm12,xmm7
        pxor    xmm1,xmm3
        movdqa  xmm3,XMMWORD[((80-128))+rax]

        movdqa  xmm8,xmm10
        movdqa  xmm7,xmm13
        pxor    xmm1,XMMWORD[((176-128))+rax]
        pxor    xmm1,xmm3
        paddd   xmm14,xmm15
        pslld   xmm8,5
        movdqa  xmm9,xmm10
        pand    xmm7,xmm12

        movdqa  xmm6,xmm13
        movdqa  xmm5,xmm1
        psrld   xmm9,27
        paddd   xmm14,xmm7
        pxor    xmm6,xmm12

        movdqa  XMMWORD[(32-128)+rax],xmm0
        paddd   xmm14,xmm0
        por     xmm8,xmm9
        psrld   xmm5,31
        pand    xmm6,xmm11
        movdqa  xmm7,xmm11

        pslld   xmm7,30
        paddd   xmm1,xmm1
        paddd   xmm14,xmm6

        psrld   xmm11,2
        paddd   xmm14,xmm8
        por     xmm1,xmm5
        por     xmm11,xmm7
        pxor    xmm2,xmm4
        movdqa  xmm4,XMMWORD[((96-128))+rax]

        movdqa  xmm8,xmm14
        movdqa  xmm7,xmm12
        pxor    xmm2,XMMWORD[((192-128))+rax]
        pxor    xmm2,xmm4
        paddd   xmm13,xmm15
        pslld   xmm8,5
        movdqa  xmm9,xmm14
        pand    xmm7,xmm11

        movdqa  xmm6,xmm12
        movdqa  xmm5,xmm2
        psrld   xmm9,27
        paddd   xmm13,xmm7
        pxor    xmm6,xmm11

        movdqa  XMMWORD[(48-128)+rax],xmm1
        paddd   xmm13,xmm1
        por     xmm8,xmm9
        psrld   xmm5,31
        pand    xmm6,xmm10
        movdqa  xmm7,xmm10

        pslld   xmm7,30
        paddd   xmm2,xmm2
        paddd   xmm13,xmm6

        psrld   xmm10,2
        paddd   xmm13,xmm8
        por     xmm2,xmm5
        por     xmm10,xmm7
        pxor    xmm3,xmm0
        movdqa  xmm0,XMMWORD[((112-128))+rax]

        movdqa  xmm8,xmm13
        movdqa  xmm7,xmm11
        pxor    xmm3,XMMWORD[((208-128))+rax]
        pxor    xmm3,xmm0
        paddd   xmm12,xmm15
        pslld   xmm8,5
        movdqa  xmm9,xmm13
        pand    xmm7,xmm10

        movdqa  xmm6,xmm11
        movdqa  xmm5,xmm3
        psrld   xmm9,27
        paddd   xmm12,xmm7
        pxor    xmm6,xmm10

        movdqa  XMMWORD[(64-128)+rax],xmm2
        paddd   xmm12,xmm2
        por     xmm8,xmm9
        psrld   xmm5,31
        pand    xmm6,xmm14
        movdqa  xmm7,xmm14

        pslld   xmm7,30
        paddd   xmm3,xmm3
        paddd   xmm12,xmm6

        psrld   xmm14,2
        paddd   xmm12,xmm8
        por     xmm3,xmm5
        por     xmm14,xmm7
        pxor    xmm4,xmm1
        movdqa  xmm1,XMMWORD[((128-128))+rax]

        movdqa  xmm8,xmm12
        movdqa  xmm7,xmm10
        pxor    xmm4,XMMWORD[((224-128))+rax]
        pxor    xmm4,xmm1
        paddd   xmm11,xmm15
        pslld   xmm8,5
        movdqa  xmm9,xmm12
        pand    xmm7,xmm14

        movdqa  xmm6,xmm10
        movdqa  xmm5,xmm4
        psrld   xmm9,27
        paddd   xmm11,xmm7
        pxor    xmm6,xmm14

        movdqa  XMMWORD[(80-128)+rax],xmm3
        paddd   xmm11,xmm3
        por     xmm8,xmm9
        psrld   xmm5,31
        pand    xmm6,xmm13
        movdqa  xmm7,xmm13

        pslld   xmm7,30
        paddd   xmm4,xmm4
        paddd   xmm11,xmm6

        psrld   xmm13,2
        paddd   xmm11,xmm8
        por     xmm4,xmm5
        por     xmm13,xmm7
        pxor    xmm0,xmm2
        movdqa  xmm2,XMMWORD[((144-128))+rax]

        movdqa  xmm8,xmm11
        movdqa  xmm7,xmm14
        pxor    xmm0,XMMWORD[((240-128))+rax]
        pxor    xmm0,xmm2
        paddd   xmm10,xmm15
        pslld   xmm8,5
        movdqa  xmm9,xmm11
        pand    xmm7,xmm13

        movdqa  xmm6,xmm14
        movdqa  xmm5,xmm0
        psrld   xmm9,27
        paddd   xmm10,xmm7
        pxor    xmm6,xmm13

        movdqa  XMMWORD[(96-128)+rax],xmm4
        paddd   xmm10,xmm4
        por     xmm8,xmm9
        psrld   xmm5,31
        pand    xmm6,xmm12
        movdqa  xmm7,xmm12

        pslld   xmm7,30
        paddd   xmm0,xmm0
        paddd   xmm10,xmm6

        psrld   xmm12,2
        paddd   xmm10,xmm8
        por     xmm0,xmm5
        por     xmm12,xmm7
        pxor    xmm1,xmm3
        movdqa  xmm3,XMMWORD[((160-128))+rax]

        movdqa  xmm8,xmm10
        movdqa  xmm7,xmm13
        pxor    xmm1,XMMWORD[((0-128))+rax]
        pxor    xmm1,xmm3
        paddd   xmm14,xmm15
        pslld   xmm8,5
        movdqa  xmm9,xmm10
        pand    xmm7,xmm12

        movdqa  xmm6,xmm13
        movdqa  xmm5,xmm1
        psrld   xmm9,27
        paddd   xmm14,xmm7
        pxor    xmm6,xmm12

        movdqa  XMMWORD[(112-128)+rax],xmm0
        paddd   xmm14,xmm0
        por     xmm8,xmm9
        psrld   xmm5,31
        pand    xmm6,xmm11
        movdqa  xmm7,xmm11

        pslld   xmm7,30
        paddd   xmm1,xmm1
        paddd   xmm14,xmm6

        psrld   xmm11,2
        paddd   xmm14,xmm8
        por     xmm1,xmm5
        por     xmm11,xmm7
        pxor    xmm2,xmm4
        movdqa  xmm4,XMMWORD[((176-128))+rax]

        movdqa  xmm8,xmm14
        movdqa  xmm7,xmm12
        pxor    xmm2,XMMWORD[((16-128))+rax]
        pxor    xmm2,xmm4
        paddd   xmm13,xmm15
        pslld   xmm8,5
        movdqa  xmm9,xmm14
        pand    xmm7,xmm11

        movdqa  xmm6,xmm12
        movdqa  xmm5,xmm2
        psrld   xmm9,27
        paddd   xmm13,xmm7
        pxor    xmm6,xmm11

        movdqa  XMMWORD[(128-128)+rax],xmm1
        paddd   xmm13,xmm1
        por     xmm8,xmm9
        psrld   xmm5,31
        pand    xmm6,xmm10
        movdqa  xmm7,xmm10

        pslld   xmm7,30
        paddd   xmm2,xmm2
        paddd   xmm13,xmm6

        psrld   xmm10,2
        paddd   xmm13,xmm8
        por     xmm2,xmm5
        por     xmm10,xmm7
        pxor    xmm3,xmm0
        movdqa  xmm0,XMMWORD[((192-128))+rax]

        movdqa  xmm8,xmm13
        movdqa  xmm7,xmm11
        pxor    xmm3,XMMWORD[((32-128))+rax]
        pxor    xmm3,xmm0
        paddd   xmm12,xmm15
        pslld   xmm8,5
        movdqa  xmm9,xmm13
        pand    xmm7,xmm10

        movdqa  xmm6,xmm11
        movdqa  xmm5,xmm3
        psrld   xmm9,27
        paddd   xmm12,xmm7
        pxor    xmm6,xmm10

        movdqa  XMMWORD[(144-128)+rax],xmm2
        paddd   xmm12,xmm2
        por     xmm8,xmm9
        psrld   xmm5,31
        pand    xmm6,xmm14
        movdqa  xmm7,xmm14

        pslld   xmm7,30
        paddd   xmm3,xmm3
        paddd   xmm12,xmm6

        psrld   xmm14,2
        paddd   xmm12,xmm8
        por     xmm3,xmm5
        por     xmm14,xmm7
        pxor    xmm4,xmm1
        movdqa  xmm1,XMMWORD[((208-128))+rax]

        movdqa  xmm8,xmm12
        movdqa  xmm7,xmm10
        pxor    xmm4,XMMWORD[((48-128))+rax]
        pxor    xmm4,xmm1
        paddd   xmm11,xmm15
        pslld   xmm8,5
        movdqa  xmm9,xmm12
        pand    xmm7,xmm14

        movdqa  xmm6,xmm10
        movdqa  xmm5,xmm4
        psrld   xmm9,27
        paddd   xmm11,xmm7
        pxor    xmm6,xmm14

        movdqa  XMMWORD[(160-128)+rax],xmm3
        paddd   xmm11,xmm3
        por     xmm8,xmm9
        psrld   xmm5,31
        pand    xmm6,xmm13
        movdqa  xmm7,xmm13

        pslld   xmm7,30
        paddd   xmm4,xmm4
        paddd   xmm11,xmm6

        psrld   xmm13,2
        paddd   xmm11,xmm8
        por     xmm4,xmm5
        por     xmm13,xmm7
        pxor    xmm0,xmm2
        movdqa  xmm2,XMMWORD[((224-128))+rax]

        movdqa  xmm8,xmm11
        movdqa  xmm7,xmm14
        pxor    xmm0,XMMWORD[((64-128))+rax]
        pxor    xmm0,xmm2
        paddd   xmm10,xmm15
        pslld   xmm8,5
        movdqa  xmm9,xmm11
        pand    xmm7,xmm13

        movdqa  xmm6,xmm14
        movdqa  xmm5,xmm0
        psrld   xmm9,27
        paddd   xmm10,xmm7
        pxor    xmm6,xmm13

        movdqa  XMMWORD[(176-128)+rax],xmm4
        paddd   xmm10,xmm4
        por     xmm8,xmm9
        psrld   xmm5,31
        pand    xmm6,xmm12
        movdqa  xmm7,xmm12

        pslld   xmm7,30
        paddd   xmm0,xmm0
        paddd   xmm10,xmm6

        psrld   xmm12,2
        paddd   xmm10,xmm8
        por     xmm0,xmm5
        por     xmm12,xmm7
        ; One unrolled round using the three-way XOR boolean form. All vector
        ; ops are packed-dword, so each 32-bit lane is processed independently.
        ; xmm14 is the accumulator; xmm10 is rotated left by 5 and added;
        ; xmm11 is rotated left by 30 in place; xmm0 is the word consumed and
        ; xmm1 is the word produced for a later round.
        movdqa  xmm15,XMMWORD[64+rbp]   ; switch to the constant vector at rbp+64 (rbp = K_XX_XX)
        pxor    xmm1,xmm3               ; schedule: xmm1 ^= previous contents of xmm3
        movdqa  xmm3,XMMWORD[((240-128))+rax]   ; refill xmm3 from the 256-byte word ring addressed via rax

        movdqa  xmm8,xmm10
        movdqa  xmm6,xmm13
        pxor    xmm1,XMMWORD[((80-128))+rax]    ; schedule: xor in a second ring entry
        paddd   xmm14,xmm15             ; accumulator += constant
        pslld   xmm8,5                  ; high part of rol(xmm10,5)
        pxor    xmm6,xmm11              ; boolean term: xmm13 ^ xmm11 ...

        movdqa  xmm9,xmm10
        movdqa  XMMWORD[(192-128)+rax],xmm0     ; write the current word back into the ring
        paddd   xmm14,xmm0              ; accumulator += current message word
        pxor    xmm1,xmm3               ; schedule: xor in the freshly loaded entry
        psrld   xmm9,27                 ; low part of rol(xmm10,5)
        pxor    xmm6,xmm12              ; ... ^ xmm12 (three-way XOR complete)
        movdqa  xmm7,xmm11

        pslld   xmm7,30                 ; high part of rol(xmm11,30)
        movdqa  xmm5,xmm1
        por     xmm8,xmm9               ; xmm8 = rol(xmm10,5)
        psrld   xmm5,31                 ; bit that wraps around in rol(xmm1,1)
        paddd   xmm14,xmm6              ; accumulator += boolean term
        paddd   xmm1,xmm1               ; xmm1 << 1 done as an add

        psrld   xmm11,2                 ; low part of rol(xmm11,30)
        paddd   xmm14,xmm8              ; accumulator += rol(xmm10,5)
        por     xmm1,xmm5               ; xmm1 = rol(xmm1,1): next schedule word ready
        por     xmm11,xmm7              ; xmm11 = rol(xmm11,30)
        pxor    xmm2,xmm4
        movdqa  xmm4,XMMWORD[((0-128))+rax]

        movdqa  xmm8,xmm14
        movdqa  xmm6,xmm12
        pxor    xmm2,XMMWORD[((96-128))+rax]
        paddd   xmm13,xmm15
        pslld   xmm8,5
        pxor    xmm6,xmm10

        movdqa  xmm9,xmm14
        movdqa  XMMWORD[(208-128)+rax],xmm1
        paddd   xmm13,xmm1
        pxor    xmm2,xmm4
        psrld   xmm9,27
        pxor    xmm6,xmm11
        movdqa  xmm7,xmm10

        pslld   xmm7,30
        movdqa  xmm5,xmm2
        por     xmm8,xmm9
        psrld   xmm5,31
        paddd   xmm13,xmm6
        paddd   xmm2,xmm2

        psrld   xmm10,2
        paddd   xmm13,xmm8
        por     xmm2,xmm5
        por     xmm10,xmm7
        pxor    xmm3,xmm0
        movdqa  xmm0,XMMWORD[((16-128))+rax]

        movdqa  xmm8,xmm13
        movdqa  xmm6,xmm11
        pxor    xmm3,XMMWORD[((112-128))+rax]
        paddd   xmm12,xmm15
        pslld   xmm8,5
        pxor    xmm6,xmm14

        movdqa  xmm9,xmm13
        movdqa  XMMWORD[(224-128)+rax],xmm2
        paddd   xmm12,xmm2
        pxor    xmm3,xmm0
        psrld   xmm9,27
        pxor    xmm6,xmm10
        movdqa  xmm7,xmm14

        pslld   xmm7,30
        movdqa  xmm5,xmm3
        por     xmm8,xmm9
        psrld   xmm5,31
        paddd   xmm12,xmm6
        paddd   xmm3,xmm3

        psrld   xmm14,2
        paddd   xmm12,xmm8
        por     xmm3,xmm5
        por     xmm14,xmm7
        pxor    xmm4,xmm1
        movdqa  xmm1,XMMWORD[((32-128))+rax]

        movdqa  xmm8,xmm12
        movdqa  xmm6,xmm10
        pxor    xmm4,XMMWORD[((128-128))+rax]
        paddd   xmm11,xmm15
        pslld   xmm8,5
        pxor    xmm6,xmm13

        movdqa  xmm9,xmm12
        movdqa  XMMWORD[(240-128)+rax],xmm3
        paddd   xmm11,xmm3
        pxor    xmm4,xmm1
        psrld   xmm9,27
        pxor    xmm6,xmm14
        movdqa  xmm7,xmm13

        pslld   xmm7,30
        movdqa  xmm5,xmm4
        por     xmm8,xmm9
        psrld   xmm5,31
        paddd   xmm11,xmm6
        paddd   xmm4,xmm4

        psrld   xmm13,2
        paddd   xmm11,xmm8
        por     xmm4,xmm5
        por     xmm13,xmm7
        pxor    xmm0,xmm2
        movdqa  xmm2,XMMWORD[((48-128))+rax]

        movdqa  xmm8,xmm11
        movdqa  xmm6,xmm14
        pxor    xmm0,XMMWORD[((144-128))+rax]
        paddd   xmm10,xmm15
        pslld   xmm8,5
        pxor    xmm6,xmm12

        movdqa  xmm9,xmm11
        movdqa  XMMWORD[(0-128)+rax],xmm4
        paddd   xmm10,xmm4
        pxor    xmm0,xmm2
        psrld   xmm9,27
        pxor    xmm6,xmm13
        movdqa  xmm7,xmm12

        pslld   xmm7,30
        movdqa  xmm5,xmm0
        por     xmm8,xmm9
        psrld   xmm5,31
        paddd   xmm10,xmm6
        paddd   xmm0,xmm0

        psrld   xmm12,2
        paddd   xmm10,xmm8
        por     xmm0,xmm5
        por     xmm12,xmm7
        pxor    xmm1,xmm3
        movdqa  xmm3,XMMWORD[((64-128))+rax]

        movdqa  xmm8,xmm10
        movdqa  xmm6,xmm13
        pxor    xmm1,XMMWORD[((160-128))+rax]
        paddd   xmm14,xmm15
        pslld   xmm8,5
        pxor    xmm6,xmm11

        movdqa  xmm9,xmm10
        movdqa  XMMWORD[(16-128)+rax],xmm0
        paddd   xmm14,xmm0
        pxor    xmm1,xmm3
        psrld   xmm9,27
        pxor    xmm6,xmm12
        movdqa  xmm7,xmm11

        pslld   xmm7,30
        movdqa  xmm5,xmm1
        por     xmm8,xmm9
        psrld   xmm5,31
        paddd   xmm14,xmm6
        paddd   xmm1,xmm1

        psrld   xmm11,2
        paddd   xmm14,xmm8
        por     xmm1,xmm5
        por     xmm11,xmm7
        pxor    xmm2,xmm4
        movdqa  xmm4,XMMWORD[((80-128))+rax]

        movdqa  xmm8,xmm14
        movdqa  xmm6,xmm12
        pxor    xmm2,XMMWORD[((176-128))+rax]
        paddd   xmm13,xmm15
        pslld   xmm8,5
        pxor    xmm6,xmm10

        movdqa  xmm9,xmm14
        movdqa  XMMWORD[(32-128)+rax],xmm1
        paddd   xmm13,xmm1
        pxor    xmm2,xmm4
        psrld   xmm9,27
        pxor    xmm6,xmm11
        movdqa  xmm7,xmm10

        pslld   xmm7,30
        movdqa  xmm5,xmm2
        por     xmm8,xmm9
        psrld   xmm5,31
        paddd   xmm13,xmm6
        paddd   xmm2,xmm2

        psrld   xmm10,2
        paddd   xmm13,xmm8
        por     xmm2,xmm5
        por     xmm10,xmm7
        pxor    xmm3,xmm0
        movdqa  xmm0,XMMWORD[((96-128))+rax]

        movdqa  xmm8,xmm13
        movdqa  xmm6,xmm11
        pxor    xmm3,XMMWORD[((192-128))+rax]
        paddd   xmm12,xmm15
        pslld   xmm8,5
        pxor    xmm6,xmm14

        movdqa  xmm9,xmm13
        movdqa  XMMWORD[(48-128)+rax],xmm2
        paddd   xmm12,xmm2
        pxor    xmm3,xmm0
        psrld   xmm9,27
        pxor    xmm6,xmm10
        movdqa  xmm7,xmm14

        pslld   xmm7,30
        movdqa  xmm5,xmm3
        por     xmm8,xmm9
        psrld   xmm5,31
        paddd   xmm12,xmm6
        paddd   xmm3,xmm3

        psrld   xmm14,2
        paddd   xmm12,xmm8
        por     xmm3,xmm5
        por     xmm14,xmm7
        pxor    xmm4,xmm1
        movdqa  xmm1,XMMWORD[((112-128))+rax]

        movdqa  xmm8,xmm12
        movdqa  xmm6,xmm10
        pxor    xmm4,XMMWORD[((208-128))+rax]
        paddd   xmm11,xmm15
        pslld   xmm8,5
        pxor    xmm6,xmm13

        movdqa  xmm9,xmm12
        movdqa  XMMWORD[(64-128)+rax],xmm3
        paddd   xmm11,xmm3
        pxor    xmm4,xmm1
        psrld   xmm9,27
        pxor    xmm6,xmm14
        movdqa  xmm7,xmm13

        pslld   xmm7,30
        movdqa  xmm5,xmm4
        por     xmm8,xmm9
        psrld   xmm5,31
        paddd   xmm11,xmm6
        paddd   xmm4,xmm4

        psrld   xmm13,2
        paddd   xmm11,xmm8
        por     xmm4,xmm5
        por     xmm13,xmm7
        pxor    xmm0,xmm2
        movdqa  xmm2,XMMWORD[((128-128))+rax]

        movdqa  xmm8,xmm11
        movdqa  xmm6,xmm14
        pxor    xmm0,XMMWORD[((224-128))+rax]
        paddd   xmm10,xmm15
        pslld   xmm8,5
        pxor    xmm6,xmm12

        movdqa  xmm9,xmm11
        movdqa  XMMWORD[(80-128)+rax],xmm4
        paddd   xmm10,xmm4
        pxor    xmm0,xmm2
        psrld   xmm9,27
        pxor    xmm6,xmm13
        movdqa  xmm7,xmm12

        pslld   xmm7,30
        movdqa  xmm5,xmm0
        por     xmm8,xmm9
        psrld   xmm5,31
        paddd   xmm10,xmm6
        paddd   xmm0,xmm0

        psrld   xmm12,2
        paddd   xmm10,xmm8
        por     xmm0,xmm5
        por     xmm12,xmm7
        pxor    xmm1,xmm3
        movdqa  xmm3,XMMWORD[((144-128))+rax]

        movdqa  xmm8,xmm10
        movdqa  xmm6,xmm13
        pxor    xmm1,XMMWORD[((240-128))+rax]
        paddd   xmm14,xmm15
        pslld   xmm8,5
        pxor    xmm6,xmm11

        movdqa  xmm9,xmm10
        movdqa  XMMWORD[(96-128)+rax],xmm0
        paddd   xmm14,xmm0
        pxor    xmm1,xmm3
        psrld   xmm9,27
        pxor    xmm6,xmm12
        movdqa  xmm7,xmm11

        pslld   xmm7,30
        movdqa  xmm5,xmm1
        por     xmm8,xmm9
        psrld   xmm5,31
        paddd   xmm14,xmm6
        paddd   xmm1,xmm1

        psrld   xmm11,2
        paddd   xmm14,xmm8
        por     xmm1,xmm5
        por     xmm11,xmm7
        pxor    xmm2,xmm4
        movdqa  xmm4,XMMWORD[((160-128))+rax]

        movdqa  xmm8,xmm14
        movdqa  xmm6,xmm12
        pxor    xmm2,XMMWORD[((0-128))+rax]
        paddd   xmm13,xmm15
        pslld   xmm8,5
        pxor    xmm6,xmm10

        movdqa  xmm9,xmm14
        movdqa  XMMWORD[(112-128)+rax],xmm1
        paddd   xmm13,xmm1
        pxor    xmm2,xmm4
        psrld   xmm9,27
        pxor    xmm6,xmm11
        movdqa  xmm7,xmm10

        pslld   xmm7,30
        movdqa  xmm5,xmm2
        por     xmm8,xmm9
        psrld   xmm5,31
        paddd   xmm13,xmm6
        paddd   xmm2,xmm2

        psrld   xmm10,2
        paddd   xmm13,xmm8
        por     xmm2,xmm5
        por     xmm10,xmm7
        pxor    xmm3,xmm0
        movdqa  xmm0,XMMWORD[((176-128))+rax]

        movdqa  xmm8,xmm13
        movdqa  xmm6,xmm11
        pxor    xmm3,XMMWORD[((16-128))+rax]
        paddd   xmm12,xmm15
        pslld   xmm8,5
        pxor    xmm6,xmm14

        movdqa  xmm9,xmm13
        paddd   xmm12,xmm2
        pxor    xmm3,xmm0
        psrld   xmm9,27
        pxor    xmm6,xmm10
        movdqa  xmm7,xmm14

        pslld   xmm7,30
        movdqa  xmm5,xmm3
        por     xmm8,xmm9
        psrld   xmm5,31
        paddd   xmm12,xmm6
        paddd   xmm3,xmm3

        psrld   xmm14,2
        paddd   xmm12,xmm8
        por     xmm3,xmm5
        por     xmm14,xmm7
        pxor    xmm4,xmm1
        movdqa  xmm1,XMMWORD[((192-128))+rax]

        movdqa  xmm8,xmm12
        movdqa  xmm6,xmm10
        pxor    xmm4,XMMWORD[((32-128))+rax]
        paddd   xmm11,xmm15
        pslld   xmm8,5
        pxor    xmm6,xmm13

        movdqa  xmm9,xmm12
        paddd   xmm11,xmm3
        pxor    xmm4,xmm1
        psrld   xmm9,27
        pxor    xmm6,xmm14
        movdqa  xmm7,xmm13

        pslld   xmm7,30
        movdqa  xmm5,xmm4
        por     xmm8,xmm9
        psrld   xmm5,31
        paddd   xmm11,xmm6
        paddd   xmm4,xmm4

        psrld   xmm13,2
        paddd   xmm11,xmm8
        por     xmm4,xmm5
        por     xmm13,xmm7
        pxor    xmm0,xmm2
        movdqa  xmm2,XMMWORD[((208-128))+rax]

        movdqa  xmm8,xmm11
        movdqa  xmm6,xmm14
        pxor    xmm0,XMMWORD[((48-128))+rax]
        paddd   xmm10,xmm15
        pslld   xmm8,5
        pxor    xmm6,xmm12

        movdqa  xmm9,xmm11
        paddd   xmm10,xmm4
        pxor    xmm0,xmm2
        psrld   xmm9,27
        pxor    xmm6,xmm13
        movdqa  xmm7,xmm12

        pslld   xmm7,30
        movdqa  xmm5,xmm0
        por     xmm8,xmm9
        psrld   xmm5,31
        paddd   xmm10,xmm6
        paddd   xmm0,xmm0

        psrld   xmm12,2
        paddd   xmm10,xmm8
        por     xmm0,xmm5
        por     xmm12,xmm7
        pxor    xmm1,xmm3
        movdqa  xmm3,XMMWORD[((224-128))+rax]

        movdqa  xmm8,xmm10
        movdqa  xmm6,xmm13
        pxor    xmm1,XMMWORD[((64-128))+rax]
        paddd   xmm14,xmm15
        pslld   xmm8,5
        pxor    xmm6,xmm11

        movdqa  xmm9,xmm10
        paddd   xmm14,xmm0
        pxor    xmm1,xmm3
        psrld   xmm9,27
        pxor    xmm6,xmm12
        movdqa  xmm7,xmm11

        pslld   xmm7,30
        movdqa  xmm5,xmm1
        por     xmm8,xmm9
        psrld   xmm5,31
        paddd   xmm14,xmm6
        paddd   xmm1,xmm1

        psrld   xmm11,2
        paddd   xmm14,xmm8
        por     xmm1,xmm5
        por     xmm11,xmm7
        pxor    xmm2,xmm4
        movdqa  xmm4,XMMWORD[((240-128))+rax]

        movdqa  xmm8,xmm14
        movdqa  xmm6,xmm12
        pxor    xmm2,XMMWORD[((80-128))+rax]
        paddd   xmm13,xmm15
        pslld   xmm8,5
        pxor    xmm6,xmm10

        movdqa  xmm9,xmm14
        paddd   xmm13,xmm1
        pxor    xmm2,xmm4
        psrld   xmm9,27
        pxor    xmm6,xmm11
        movdqa  xmm7,xmm10

        pslld   xmm7,30
        movdqa  xmm5,xmm2
        por     xmm8,xmm9
        psrld   xmm5,31
        paddd   xmm13,xmm6
        paddd   xmm2,xmm2

        psrld   xmm10,2
        paddd   xmm13,xmm8
        por     xmm2,xmm5
        por     xmm10,xmm7
        pxor    xmm3,xmm0
        movdqa  xmm0,XMMWORD[((0-128))+rax]

        movdqa  xmm8,xmm13
        movdqa  xmm6,xmm11
        pxor    xmm3,XMMWORD[((96-128))+rax]
        paddd   xmm12,xmm15
        pslld   xmm8,5
        pxor    xmm6,xmm14

        movdqa  xmm9,xmm13
        paddd   xmm12,xmm2
        pxor    xmm3,xmm0
        psrld   xmm9,27
        pxor    xmm6,xmm10
        movdqa  xmm7,xmm14

        pslld   xmm7,30
        movdqa  xmm5,xmm3
        por     xmm8,xmm9
        psrld   xmm5,31
        paddd   xmm12,xmm6
        paddd   xmm3,xmm3

        psrld   xmm14,2
        paddd   xmm12,xmm8
        por     xmm3,xmm5
        por     xmm14,xmm7
        pxor    xmm4,xmm1
        movdqa  xmm1,XMMWORD[((16-128))+rax]

        movdqa  xmm8,xmm12
        movdqa  xmm6,xmm10
        pxor    xmm4,XMMWORD[((112-128))+rax]
        paddd   xmm11,xmm15
        pslld   xmm8,5
        pxor    xmm6,xmm13

        movdqa  xmm9,xmm12
        paddd   xmm11,xmm3
        pxor    xmm4,xmm1
        psrld   xmm9,27
        pxor    xmm6,xmm14
        movdqa  xmm7,xmm13

        pslld   xmm7,30
        movdqa  xmm5,xmm4
        por     xmm8,xmm9
        psrld   xmm5,31
        paddd   xmm11,xmm6
        paddd   xmm4,xmm4

        psrld   xmm13,2
        paddd   xmm11,xmm8
        por     xmm4,xmm5
        por     xmm13,xmm7
        movdqa  xmm8,xmm11
        paddd   xmm10,xmm15
        movdqa  xmm6,xmm14
        pslld   xmm8,5
        pxor    xmm6,xmm12

        movdqa  xmm9,xmm11
        paddd   xmm10,xmm4
        psrld   xmm9,27
        movdqa  xmm7,xmm12
        pxor    xmm6,xmm13

        pslld   xmm7,30
        por     xmm8,xmm9
        paddd   xmm10,xmm6

        psrld   xmm12,2
        paddd   xmm10,xmm8
        por     xmm12,xmm7
        movdqa  xmm0,XMMWORD[rbx]
        mov     ecx,1
        cmp     ecx,DWORD[rbx]
        pxor    xmm8,xmm8
        cmovge  r8,rbp
        cmp     ecx,DWORD[4+rbx]
        movdqa  xmm1,xmm0
        cmovge  r9,rbp
        cmp     ecx,DWORD[8+rbx]
        pcmpgtd xmm1,xmm8
        cmovge  r10,rbp
        cmp     ecx,DWORD[12+rbx]
        paddd   xmm0,xmm1
        cmovge  r11,rbp

        movdqu  xmm6,XMMWORD[rdi]
        pand    xmm10,xmm1
        movdqu  xmm7,XMMWORD[32+rdi]
        pand    xmm11,xmm1
        paddd   xmm10,xmm6
        movdqu  xmm8,XMMWORD[64+rdi]
        pand    xmm12,xmm1
        paddd   xmm11,xmm7
        movdqu  xmm9,XMMWORD[96+rdi]
        pand    xmm13,xmm1
        paddd   xmm12,xmm8
        movdqu  xmm5,XMMWORD[128+rdi]
        pand    xmm14,xmm1
        movdqu  XMMWORD[rdi],xmm10
        paddd   xmm13,xmm9
        movdqu  XMMWORD[32+rdi],xmm11
        paddd   xmm14,xmm5
        movdqu  XMMWORD[64+rdi],xmm12
        movdqu  XMMWORD[96+rdi],xmm13
        movdqu  XMMWORD[128+rdi],xmm14

        movdqa  XMMWORD[rbx],xmm0
        movdqa  xmm5,XMMWORD[96+rbp]
        movdqa  xmm15,XMMWORD[((-32))+rbp]
        dec     edx
        jnz     NEAR $L$oop

        mov     edx,DWORD[280+rsp]
        lea     rdi,[16+rdi]
        lea     rsi,[64+rsi]
        dec     edx
        jnz     NEAR $L$oop_grande

$L$done:
        mov     rax,QWORD[272+rsp]

        movaps  xmm6,XMMWORD[((-184))+rax]
        movaps  xmm7,XMMWORD[((-168))+rax]
        movaps  xmm8,XMMWORD[((-152))+rax]
        movaps  xmm9,XMMWORD[((-136))+rax]
        movaps  xmm10,XMMWORD[((-120))+rax]
        movaps  xmm11,XMMWORD[((-104))+rax]
        movaps  xmm12,XMMWORD[((-88))+rax]
        movaps  xmm13,XMMWORD[((-72))+rax]
        movaps  xmm14,XMMWORD[((-56))+rax]
        movaps  xmm15,XMMWORD[((-40))+rax]
        mov     rbp,QWORD[((-16))+rax]

        mov     rbx,QWORD[((-8))+rax]

        lea     rsp,[rax]

$L$epilogue:
        mov     rdi,QWORD[8+rsp]        ;WIN64 epilogue
        mov     rsi,QWORD[16+rsp]
        DB      0F3h,0C3h               ;repret

$L$SEH_end_sha1_multi_block:

ALIGN   32
;-----------------------------------------------------------------------
; sha1_multi_block_shaext -- two-lane SHA-1 block function built on the
; SHA extension instructions (emitted below as raw DB bytes; the decoded
; mnemonic is given in the trailing comment of each DB line).
;
; ABI:  Microsoft x64.  The Win64 arguments rcx, rdx, r8 are copied to
;       rdi, rsi, rdx, which are the names used in the body:
;         rdi = state block, read/written as five 8-byte rows located at
;               offsets 0, 32, 64, 96 and 128 (two 32-bit lanes per row;
;               presumably the SHA-1 a..e words -- confirm with caller)
;         rsi = array of 16-byte input descriptors:
;               qword at +0 = data pointer, dword at +8 = block count
;         edx = count; it is doubled on entry and then consumed one
;               descriptor pair per outer iteration
;
; _shaext_shortcut is a second entry: sha1_multi_block jumps here after
; its own prologue when bit 61 of the qword at OPENSSL_ia32cap_P+4 is
; set, so rdi/rsi/rdx are already in place at that point.
;
; Preserved: rbx, rbp, rdi, rsi, xmm6-xmm15 (saved and restored below).
;
; Register roles inside the loops:
;   r8, r9        input pointers of lane 0 / lane 1
;   rbx           -> the two 32-bit per-lane block counters (256+rsp)
;   edx           inner loop: largest block count of the two lanes
;   rax           caller's rsp; never written between prologue and
;                 epilogue, which is what the epilogue relies on
;   xmm3          pshufb mask loaded from K_XX_XX+128
;                 (presumably a byte-swap mask -- confirm against table)
;   xmm0/xmm8     packed state of lane 0 / lane 1
;   xmm1,xmm2 / xmm9,xmm10    alternating E operands, lane 0 / lane 1
;   xmm4-xmm7 / xmm11-xmm14   message words, lane 0 / lane 1
;
; Stack frame after alignment (rsp is 256-byte aligned):
;    64+rsp  saved xmm0      80+rsp  saved xmm1     (lane 0)
;    96+rsp  saved xmm8     112+rsp  saved xmm9     (lane 1)
;   256+rsp  lane counters  272+rsp  caller's rsp   280+rsp  outer count
;-----------------------------------------------------------------------
sha1_multi_block_shaext:
        mov     QWORD[8+rsp],rdi        ;WIN64 prologue
        mov     QWORD[16+rsp],rsi
        mov     rax,rsp
$L$SEH_begin_sha1_multi_block_shaext:
        mov     rdi,rcx
        mov     rsi,rdx
        mov     rdx,r8



_shaext_shortcut:
        mov     rax,rsp                 ; rax = rsp on entry, base for save/restore

        push    rbx

        push    rbp

        ; Reserve 168 bytes below the two pushes and store xmm6-xmm15 there.
        ; xmm6-xmm9 are addressed via the new rsp (= rax-184), the rest via rax.
        lea     rsp,[((-168))+rsp]
        movaps  XMMWORD[rsp],xmm6
        movaps  XMMWORD[16+rsp],xmm7
        movaps  XMMWORD[32+rsp],xmm8
        movaps  XMMWORD[48+rsp],xmm9
        movaps  XMMWORD[(-120)+rax],xmm10
        movaps  XMMWORD[(-104)+rax],xmm11
        movaps  XMMWORD[(-88)+rax],xmm12
        movaps  XMMWORD[(-72)+rax],xmm13
        movaps  XMMWORD[(-56)+rax],xmm14
        movaps  XMMWORD[(-40)+rax],xmm15
        sub     rsp,288
        shl     edx,1                   ; edx *= 2 (outer loop handles 2 descriptors per pass)
        and     rsp,-256                ; align the working frame to 256 bytes
        lea     rdi,[64+rdi]            ; bias rdi by 64; all accesses below use (off-64)+rdi
        mov     QWORD[272+rsp],rax      ; keep the entry rsp in the frame
$L$body_shaext:
        lea     rbx,[256+rsp]           ; rbx -> per-lane block counters
        movdqa  xmm3,XMMWORD[((K_XX_XX+128))]   ; pshufb mask used on every message load

$L$oop_grande_shaext:
        ; Outer loop: one pass per pair of input descriptors at rsi.
        mov     DWORD[280+rsp],edx      ; stash outer count; edx is reused as max block count
        xor     edx,edx

        ; Lane 0: pointer, block count; track the maximum count in edx.
        mov     r8,QWORD[rsi]

        mov     ecx,DWORD[8+rsi]
        cmp     ecx,edx
        cmovg   edx,ecx
        test    ecx,ecx
        mov     DWORD[rbx],ecx          ; counter[0] = block count
        cmovle  r8,rsp                  ; count <= 0: read from the stack frame instead

        ; Lane 1: same for the second descriptor.
        mov     r9,QWORD[16+rsi]

        mov     ecx,DWORD[24+rsi]
        cmp     ecx,edx
        cmovg   edx,ecx
        test    ecx,ecx
        mov     DWORD[4+rbx],ecx        ; counter[1] = block count
        cmovle  r9,rsp                  ; count <= 0: read from the stack frame instead
        test    edx,edx
        jz      NEAR $L$done_shaext     ; neither lane has a positive count: leave

        ; Load 8 bytes (two lanes) from each of the five state rows.
        movq    xmm0,QWORD[((0-64))+rdi]
        movq    xmm4,QWORD[((32-64))+rdi]
        movq    xmm5,QWORD[((64-64))+rdi]
        movq    xmm6,QWORD[((96-64))+rdi]
        movq    xmm7,QWORD[((128-64))+rdi]

        ; Transpose rows 0..3 so that xmm0 holds lane 0 and xmm8 holds lane 1.
        punpckldq       xmm0,xmm4
        punpckldq       xmm5,xmm6

        movdqa  xmm8,xmm0
        punpcklqdq      xmm0,xmm5
        punpckhqdq      xmm8,xmm5

        ; Row 4: dword 0 -> top dword of xmm1, dword 1 -> top dword of xmm9.
        pshufd  xmm1,xmm7,63
        pshufd  xmm9,xmm7,127
        ; Reverse the dword order of both packed states.
        pshufd  xmm0,xmm0,27
        pshufd  xmm8,xmm8,27
        jmp     NEAR $L$oop_shaext

ALIGN   32
$L$oop_shaext:
        ; Inner loop: one 64-byte block per lane per pass; the two lanes'
        ; instruction streams are interleaved.
        movdqu  xmm4,XMMWORD[r8]
        movdqu  xmm11,XMMWORD[r9]
        movdqu  xmm5,XMMWORD[16+r8]
        movdqu  xmm12,XMMWORD[16+r9]
        movdqu  xmm6,XMMWORD[32+r8]
DB      102,15,56,0,227                 ; pshufb xmm4,xmm3
        movdqu  xmm13,XMMWORD[32+r9]
DB      102,68,15,56,0,219              ; pshufb xmm11,xmm3
        movdqu  xmm7,XMMWORD[48+r8]
        lea     r8,[64+r8]
DB      102,15,56,0,235                 ; pshufb xmm5,xmm3
        movdqu  xmm14,XMMWORD[48+r9]
        lea     r9,[64+r9]
DB      102,68,15,56,0,227              ; pshufb xmm12,xmm3

        ; Save the incoming state of both lanes; it is added back after
        ; the last round group.
        movdqa  XMMWORD[80+rsp],xmm1
        paddd   xmm1,xmm4
        movdqa  XMMWORD[112+rsp],xmm9
        paddd   xmm9,xmm11
        movdqa  XMMWORD[64+rsp],xmm0
        movdqa  xmm2,xmm0
        movdqa  XMMWORD[96+rsp],xmm8
        movdqa  xmm10,xmm8
        ; Twenty sha1rnds4 per lane follow: five each with immediate 0, 1, 2, 3.
DB      15,58,204,193,0                 ; sha1rnds4 xmm0,xmm1,0
DB      15,56,200,213                   ; sha1nexte xmm2,xmm5
DB      69,15,58,204,193,0              ; sha1rnds4 xmm8,xmm9,0
DB      69,15,56,200,212                ; sha1nexte xmm10,xmm12
DB      102,15,56,0,243                 ; pshufb xmm6,xmm3
        prefetcht0      [127+r8]
DB      15,56,201,229                   ; sha1msg1 xmm4,xmm5
DB      102,68,15,56,0,235              ; pshufb xmm13,xmm3
        prefetcht0      [127+r9]
DB      69,15,56,201,220                ; sha1msg1 xmm11,xmm12

DB      102,15,56,0,251                 ; pshufb xmm7,xmm3
        movdqa  xmm1,xmm0
DB      102,68,15,56,0,243              ; pshufb xmm14,xmm3
        movdqa  xmm9,xmm8
DB      15,58,204,194,0                 ; sha1rnds4 xmm0,xmm2,0
DB      15,56,200,206                   ; sha1nexte xmm1,xmm6
DB      69,15,58,204,194,0              ; sha1rnds4 xmm8,xmm10,0
DB      69,15,56,200,205                ; sha1nexte xmm9,xmm13
        pxor    xmm4,xmm6
DB      15,56,201,238                   ; sha1msg1 xmm5,xmm6
        pxor    xmm11,xmm13
DB      69,15,56,201,229                ; sha1msg1 xmm12,xmm13
        movdqa  xmm2,xmm0
        movdqa  xmm10,xmm8
DB      15,58,204,193,0                 ; sha1rnds4 xmm0,xmm1,0
DB      15,56,200,215                   ; sha1nexte xmm2,xmm7
DB      69,15,58,204,193,0              ; sha1rnds4 xmm8,xmm9,0
DB      69,15,56,200,214                ; sha1nexte xmm10,xmm14
DB      15,56,202,231                   ; sha1msg2 xmm4,xmm7
DB      69,15,56,202,222                ; sha1msg2 xmm11,xmm14
        pxor    xmm5,xmm7
DB      15,56,201,247                   ; sha1msg1 xmm6,xmm7
        pxor    xmm12,xmm14
DB      69,15,56,201,238                ; sha1msg1 xmm13,xmm14
        movdqa  xmm1,xmm0
        movdqa  xmm9,xmm8
DB      15,58,204,194,0                 ; sha1rnds4 xmm0,xmm2,0
DB      15,56,200,204                   ; sha1nexte xmm1,xmm4
DB      69,15,58,204,194,0              ; sha1rnds4 xmm8,xmm10,0
DB      69,15,56,200,203                ; sha1nexte xmm9,xmm11
DB      15,56,202,236                   ; sha1msg2 xmm5,xmm4
DB      69,15,56,202,227                ; sha1msg2 xmm12,xmm11
        pxor    xmm6,xmm4
DB      15,56,201,252                   ; sha1msg1 xmm7,xmm4
        pxor    xmm13,xmm11
DB      69,15,56,201,243                ; sha1msg1 xmm14,xmm11
        movdqa  xmm2,xmm0
        movdqa  xmm10,xmm8
DB      15,58,204,193,0                 ; sha1rnds4 xmm0,xmm1,0
DB      15,56,200,213                   ; sha1nexte xmm2,xmm5
DB      69,15,58,204,193,0              ; sha1rnds4 xmm8,xmm9,0
DB      69,15,56,200,212                ; sha1nexte xmm10,xmm12
DB      15,56,202,245                   ; sha1msg2 xmm6,xmm5
DB      69,15,56,202,236                ; sha1msg2 xmm13,xmm12
        pxor    xmm7,xmm5
DB      15,56,201,229                   ; sha1msg1 xmm4,xmm5
        pxor    xmm14,xmm12
DB      69,15,56,201,220                ; sha1msg1 xmm11,xmm12
        movdqa  xmm1,xmm0
        movdqa  xmm9,xmm8
DB      15,58,204,194,1                 ; sha1rnds4 xmm0,xmm2,1
DB      15,56,200,206                   ; sha1nexte xmm1,xmm6
DB      69,15,58,204,194,1              ; sha1rnds4 xmm8,xmm10,1
DB      69,15,56,200,205                ; sha1nexte xmm9,xmm13
DB      15,56,202,254                   ; sha1msg2 xmm7,xmm6
DB      69,15,56,202,245                ; sha1msg2 xmm14,xmm13
        pxor    xmm4,xmm6
DB      15,56,201,238                   ; sha1msg1 xmm5,xmm6
        pxor    xmm11,xmm13
DB      69,15,56,201,229                ; sha1msg1 xmm12,xmm13
        movdqa  xmm2,xmm0
        movdqa  xmm10,xmm8
DB      15,58,204,193,1                 ; sha1rnds4 xmm0,xmm1,1
DB      15,56,200,215                   ; sha1nexte xmm2,xmm7
DB      69,15,58,204,193,1              ; sha1rnds4 xmm8,xmm9,1
DB      69,15,56,200,214                ; sha1nexte xmm10,xmm14
DB      15,56,202,231                   ; sha1msg2 xmm4,xmm7
DB      69,15,56,202,222                ; sha1msg2 xmm11,xmm14
        pxor    xmm5,xmm7
DB      15,56,201,247                   ; sha1msg1 xmm6,xmm7
        pxor    xmm12,xmm14
DB      69,15,56,201,238                ; sha1msg1 xmm13,xmm14
        movdqa  xmm1,xmm0
        movdqa  xmm9,xmm8
DB      15,58,204,194,1                 ; sha1rnds4 xmm0,xmm2,1
DB      15,56,200,204                   ; sha1nexte xmm1,xmm4
DB      69,15,58,204,194,1              ; sha1rnds4 xmm8,xmm10,1
DB      69,15,56,200,203                ; sha1nexte xmm9,xmm11
DB      15,56,202,236                   ; sha1msg2 xmm5,xmm4
DB      69,15,56,202,227                ; sha1msg2 xmm12,xmm11
        pxor    xmm6,xmm4
DB      15,56,201,252                   ; sha1msg1 xmm7,xmm4
        pxor    xmm13,xmm11
DB      69,15,56,201,243                ; sha1msg1 xmm14,xmm11
        movdqa  xmm2,xmm0
        movdqa  xmm10,xmm8
DB      15,58,204,193,1                 ; sha1rnds4 xmm0,xmm1,1
DB      15,56,200,213                   ; sha1nexte xmm2,xmm5
DB      69,15,58,204,193,1              ; sha1rnds4 xmm8,xmm9,1
DB      69,15,56,200,212                ; sha1nexte xmm10,xmm12
DB      15,56,202,245                   ; sha1msg2 xmm6,xmm5
DB      69,15,56,202,236                ; sha1msg2 xmm13,xmm12
        pxor    xmm7,xmm5
DB      15,56,201,229                   ; sha1msg1 xmm4,xmm5
        pxor    xmm14,xmm12
DB      69,15,56,201,220                ; sha1msg1 xmm11,xmm12
        movdqa  xmm1,xmm0
        movdqa  xmm9,xmm8
DB      15,58,204,194,1                 ; sha1rnds4 xmm0,xmm2,1
DB      15,56,200,206                   ; sha1nexte xmm1,xmm6
DB      69,15,58,204,194,1              ; sha1rnds4 xmm8,xmm10,1
DB      69,15,56,200,205                ; sha1nexte xmm9,xmm13
DB      15,56,202,254                   ; sha1msg2 xmm7,xmm6
DB      69,15,56,202,245                ; sha1msg2 xmm14,xmm13
        pxor    xmm4,xmm6
DB      15,56,201,238                   ; sha1msg1 xmm5,xmm6
        pxor    xmm11,xmm13
DB      69,15,56,201,229                ; sha1msg1 xmm12,xmm13
        movdqa  xmm2,xmm0
        movdqa  xmm10,xmm8
DB      15,58,204,193,2                 ; sha1rnds4 xmm0,xmm1,2
DB      15,56,200,215                   ; sha1nexte xmm2,xmm7
DB      69,15,58,204,193,2              ; sha1rnds4 xmm8,xmm9,2
DB      69,15,56,200,214                ; sha1nexte xmm10,xmm14
DB      15,56,202,231                   ; sha1msg2 xmm4,xmm7
DB      69,15,56,202,222                ; sha1msg2 xmm11,xmm14
        pxor    xmm5,xmm7
DB      15,56,201,247                   ; sha1msg1 xmm6,xmm7
        pxor    xmm12,xmm14
DB      69,15,56,201,238                ; sha1msg1 xmm13,xmm14
        movdqa  xmm1,xmm0
        movdqa  xmm9,xmm8
DB      15,58,204,194,2                 ; sha1rnds4 xmm0,xmm2,2
DB      15,56,200,204                   ; sha1nexte xmm1,xmm4
DB      69,15,58,204,194,2              ; sha1rnds4 xmm8,xmm10,2
DB      69,15,56,200,203                ; sha1nexte xmm9,xmm11
DB      15,56,202,236                   ; sha1msg2 xmm5,xmm4
DB      69,15,56,202,227                ; sha1msg2 xmm12,xmm11
        pxor    xmm6,xmm4
DB      15,56,201,252                   ; sha1msg1 xmm7,xmm4
        pxor    xmm13,xmm11
DB      69,15,56,201,243                ; sha1msg1 xmm14,xmm11
        movdqa  xmm2,xmm0
        movdqa  xmm10,xmm8
DB      15,58,204,193,2                 ; sha1rnds4 xmm0,xmm1,2
DB      15,56,200,213                   ; sha1nexte xmm2,xmm5
DB      69,15,58,204,193,2              ; sha1rnds4 xmm8,xmm9,2
DB      69,15,56,200,212                ; sha1nexte xmm10,xmm12
DB      15,56,202,245                   ; sha1msg2 xmm6,xmm5
DB      69,15,56,202,236                ; sha1msg2 xmm13,xmm12
        pxor    xmm7,xmm5
DB      15,56,201,229                   ; sha1msg1 xmm4,xmm5
        pxor    xmm14,xmm12
DB      69,15,56,201,220                ; sha1msg1 xmm11,xmm12
        movdqa  xmm1,xmm0
        movdqa  xmm9,xmm8
DB      15,58,204,194,2                 ; sha1rnds4 xmm0,xmm2,2
DB      15,56,200,206                   ; sha1nexte xmm1,xmm6
DB      69,15,58,204,194,2              ; sha1rnds4 xmm8,xmm10,2
DB      69,15,56,200,205                ; sha1nexte xmm9,xmm13
DB      15,56,202,254                   ; sha1msg2 xmm7,xmm6
DB      69,15,56,202,245                ; sha1msg2 xmm14,xmm13
        pxor    xmm4,xmm6
DB      15,56,201,238                   ; sha1msg1 xmm5,xmm6
        pxor    xmm11,xmm13
DB      69,15,56,201,229                ; sha1msg1 xmm12,xmm13
        movdqa  xmm2,xmm0
        movdqa  xmm10,xmm8
DB      15,58,204,193,2                 ; sha1rnds4 xmm0,xmm1,2
DB      15,56,200,215                   ; sha1nexte xmm2,xmm7
DB      69,15,58,204,193,2              ; sha1rnds4 xmm8,xmm9,2
DB      69,15,56,200,214                ; sha1nexte xmm10,xmm14
DB      15,56,202,231                   ; sha1msg2 xmm4,xmm7
DB      69,15,56,202,222                ; sha1msg2 xmm11,xmm14
        pxor    xmm5,xmm7
DB      15,56,201,247                   ; sha1msg1 xmm6,xmm7
        pxor    xmm12,xmm14
DB      69,15,56,201,238                ; sha1msg1 xmm13,xmm14
        movdqa  xmm1,xmm0
        movdqa  xmm9,xmm8
DB      15,58,204,194,3                 ; sha1rnds4 xmm0,xmm2,3
DB      15,56,200,204                   ; sha1nexte xmm1,xmm4
DB      69,15,58,204,194,3              ; sha1rnds4 xmm8,xmm10,3
DB      69,15,56,200,203                ; sha1nexte xmm9,xmm11
DB      15,56,202,236                   ; sha1msg2 xmm5,xmm4
DB      69,15,56,202,227                ; sha1msg2 xmm12,xmm11
        pxor    xmm6,xmm4
DB      15,56,201,252                   ; sha1msg1 xmm7,xmm4
        pxor    xmm13,xmm11
DB      69,15,56,201,243                ; sha1msg1 xmm14,xmm11
        movdqa  xmm2,xmm0
        movdqa  xmm10,xmm8
DB      15,58,204,193,3                 ; sha1rnds4 xmm0,xmm1,3
DB      15,56,200,213                   ; sha1nexte xmm2,xmm5
DB      69,15,58,204,193,3              ; sha1rnds4 xmm8,xmm9,3
DB      69,15,56,200,212                ; sha1nexte xmm10,xmm12
DB      15,56,202,245                   ; sha1msg2 xmm6,xmm5
DB      69,15,56,202,236                ; sha1msg2 xmm13,xmm12
        pxor    xmm7,xmm5
        pxor    xmm14,xmm12

        ; Counter bookkeeping is interleaved with the last round groups.
        ; xmm4 is cleared here: it serves as the zero operand of the
        ; pcmpgtd compares and of the final sha1nexte pair below.
        mov     ecx,1
        pxor    xmm4,xmm4
        cmp     ecx,DWORD[rbx]
        cmovge  r8,rsp                  ; counter[0] <= 1: next loads come from the stack frame

        movdqa  xmm1,xmm0
        movdqa  xmm9,xmm8
DB      15,58,204,194,3                 ; sha1rnds4 xmm0,xmm2,3
DB      15,56,200,206                   ; sha1nexte xmm1,xmm6
DB      69,15,58,204,194,3              ; sha1rnds4 xmm8,xmm10,3
DB      69,15,56,200,205                ; sha1nexte xmm9,xmm13
DB      15,56,202,254                   ; sha1msg2 xmm7,xmm6
DB      69,15,56,202,245                ; sha1msg2 xmm14,xmm13

        cmp     ecx,DWORD[4+rbx]
        cmovge  r9,rsp                  ; counter[1] <= 1: next loads come from the stack frame
        movq    xmm6,QWORD[rbx]         ; xmm6 = both lane counters

        movdqa  xmm2,xmm0
        movdqa  xmm10,xmm8
DB      15,58,204,193,3                 ; sha1rnds4 xmm0,xmm1,3
DB      15,56,200,215                   ; sha1nexte xmm2,xmm7
DB      69,15,58,204,193,3              ; sha1rnds4 xmm8,xmm9,3
DB      69,15,56,200,214                ; sha1nexte xmm10,xmm14

        ; Broadcast each counter and compare with zero:
        ; xmm11 / xmm12 = all-ones when lane 0 / lane 1 counter is > 0.
        pshufd  xmm11,xmm6,0x00
        pshufd  xmm12,xmm6,0x55
        movdqa  xmm7,xmm6
        pcmpgtd xmm11,xmm4
        pcmpgtd xmm12,xmm4

        movdqa  xmm1,xmm0
        movdqa  xmm9,xmm8
DB      15,58,204,194,3                 ; sha1rnds4 xmm0,xmm2,3
DB      15,56,200,204                   ; sha1nexte xmm1,xmm4
DB      69,15,58,204,194,3              ; sha1rnds4 xmm8,xmm10,3
DB      68,15,56,200,204                ; sha1nexte xmm9,xmm4

        ; Zero the freshly computed state of lanes whose counter is not
        ; positive, and add -1 to every positive counter.
        pcmpgtd xmm7,xmm4
        pand    xmm0,xmm11
        pand    xmm1,xmm11
        pand    xmm8,xmm12
        pand    xmm9,xmm12
        paddd   xmm6,xmm7

        ; Add the state saved at the top of the loop; a masked-off lane
        ; therefore ends the pass with its state unchanged.
        paddd   xmm0,XMMWORD[64+rsp]
        paddd   xmm1,XMMWORD[80+rsp]
        paddd   xmm8,XMMWORD[96+rsp]
        paddd   xmm9,XMMWORD[112+rsp]

        movq    QWORD[rbx],xmm6         ; write the updated counters back
        dec     edx
        jnz     NEAR $L$oop_shaext

        mov     edx,DWORD[280+rsp]      ; reload the outer count

        ; Undo the dword reversal and re-interleave the two lanes into
        ; the five 8-byte rows of the state block.
        pshufd  xmm0,xmm0,27
        pshufd  xmm8,xmm8,27

        movdqa  xmm6,xmm0
        punpckldq       xmm0,xmm8
        punpckhdq       xmm6,xmm8
        punpckhdq       xmm1,xmm9
        movq    QWORD[(0-64)+rdi],xmm0
        psrldq  xmm0,8
        movq    QWORD[(64-64)+rdi],xmm6
        psrldq  xmm6,8
        movq    QWORD[(32-64)+rdi],xmm0
        psrldq  xmm1,8
        movq    QWORD[(96-64)+rdi],xmm6
        movq    QWORD[(128-64)+rdi],xmm1

        lea     rdi,[8+rdi]             ; advance 8 bytes within each state row
        lea     rsi,[32+rsi]            ; advance past the two descriptors just handled
        dec     edx
        jnz     NEAR $L$oop_grande_shaext

$L$done_shaext:
        ; rax still holds the entry rsp (it is not written after the
        ; prologue), so the save area is addressed through it directly.

        movaps  xmm6,XMMWORD[((-184))+rax]
        movaps  xmm7,XMMWORD[((-168))+rax]
        movaps  xmm8,XMMWORD[((-152))+rax]
        movaps  xmm9,XMMWORD[((-136))+rax]
        movaps  xmm10,XMMWORD[((-120))+rax]
        movaps  xmm11,XMMWORD[((-104))+rax]
        movaps  xmm12,XMMWORD[((-88))+rax]
        movaps  xmm13,XMMWORD[((-72))+rax]
        movaps  xmm14,XMMWORD[((-56))+rax]
        movaps  xmm15,XMMWORD[((-40))+rax]
        mov     rbp,QWORD[((-16))+rax]

        mov     rbx,QWORD[((-8))+rax]

        lea     rsp,[rax]               ; rsp = value on entry

$L$epilogue_shaext:
        mov     rdi,QWORD[8+rsp]        ;WIN64 epilogue
        mov     rsi,QWORD[16+rsp]
        DB      0F3h,0C3h               ;repret

$L$SEH_end_sha1_multi_block_shaext:

ALIGN   32
sha1_multi_block_avx:
        mov     QWORD[8+rsp],rdi        ;WIN64 prologue
        mov     QWORD[16+rsp],rsi
        mov     rax,rsp
$L$SEH_begin_sha1_multi_block_avx:
        mov     rdi,rcx
        mov     rsi,rdx
        mov     rdx,r8



_avx_shortcut:
        shr     rcx,32
        cmp     edx,2
        jb      NEAR $L$avx
        test    ecx,32
        jnz     NEAR _avx2_shortcut
        jmp     NEAR $L$avx
ALIGN   32
$L$avx:
        mov     rax,rsp

        push    rbx

        push    rbp

        lea     rsp,[((-168))+rsp]
        movaps  XMMWORD[rsp],xmm6
        movaps  XMMWORD[16+rsp],xmm7
        movaps  XMMWORD[32+rsp],xmm8
        movaps  XMMWORD[48+rsp],xmm9
        movaps  XMMWORD[(-120)+rax],xmm10
        movaps  XMMWORD[(-104)+rax],xmm11
        movaps  XMMWORD[(-88)+rax],xmm12
        movaps  XMMWORD[(-72)+rax],xmm13
        movaps  XMMWORD[(-56)+rax],xmm14
        movaps  XMMWORD[(-40)+rax],xmm15
        sub     rsp,288
        and     rsp,-256
        mov     QWORD[272+rsp],rax

$L$body_avx:
        lea     rbp,[K_XX_XX]
        lea     rbx,[256+rsp]

        vzeroupper
$L$oop_grande_avx:
        mov     DWORD[280+rsp],edx
        xor     edx,edx

        mov     r8,QWORD[rsi]

        mov     ecx,DWORD[8+rsi]
        cmp     ecx,edx
        cmovg   edx,ecx
        test    ecx,ecx
        mov     DWORD[rbx],ecx
        cmovle  r8,rbp

        mov     r9,QWORD[16+rsi]

        mov     ecx,DWORD[24+rsi]
        cmp     ecx,edx
        cmovg   edx,ecx
        test    ecx,ecx
        mov     DWORD[4+rbx],ecx
        cmovle  r9,rbp

        mov     r10,QWORD[32+rsi]

        mov     ecx,DWORD[40+rsi]
        cmp     ecx,edx
        cmovg   edx,ecx
        test    ecx,ecx
        mov     DWORD[8+rbx],ecx
        cmovle  r10,rbp

        mov     r11,QWORD[48+rsi]

        mov     ecx,DWORD[56+rsi]
        cmp     ecx,edx
        cmovg   edx,ecx
        test    ecx,ecx
        mov     DWORD[12+rbx],ecx
        cmovle  r11,rbp
        test    edx,edx
        jz      NEAR $L$done_avx

        vmovdqu xmm10,XMMWORD[rdi]
        lea     rax,[128+rsp]
        vmovdqu xmm11,XMMWORD[32+rdi]
        vmovdqu xmm12,XMMWORD[64+rdi]
        vmovdqu xmm13,XMMWORD[96+rdi]
        vmovdqu xmm14,XMMWORD[128+rdi]
        vmovdqu xmm5,XMMWORD[96+rbp]
        jmp     NEAR $L$oop_avx

ALIGN   32
$L$oop_avx:
        vmovdqa xmm15,XMMWORD[((-32))+rbp]
        vmovd   xmm0,DWORD[r8]
        lea     r8,[64+r8]
        vmovd   xmm2,DWORD[r9]
        lea     r9,[64+r9]
        vpinsrd xmm0,xmm0,DWORD[r10],1
        lea     r10,[64+r10]
        vpinsrd xmm2,xmm2,DWORD[r11],1
        lea     r11,[64+r11]
        vmovd   xmm1,DWORD[((-60))+r8]
        vpunpckldq      xmm0,xmm0,xmm2
        vmovd   xmm9,DWORD[((-60))+r9]
        vpshufb xmm0,xmm0,xmm5
        vpinsrd xmm1,xmm1,DWORD[((-60))+r10],1
        vpinsrd xmm9,xmm9,DWORD[((-60))+r11],1
        vpaddd  xmm14,xmm14,xmm15
        vpslld  xmm8,xmm10,5
        vpandn  xmm7,xmm11,xmm13
        vpand   xmm6,xmm11,xmm12

        vmovdqa XMMWORD[(0-128)+rax],xmm0
        vpaddd  xmm14,xmm14,xmm0
        vpunpckldq      xmm1,xmm1,xmm9
        vpsrld  xmm9,xmm10,27
        vpxor   xmm6,xmm6,xmm7
        vmovd   xmm2,DWORD[((-56))+r8]

        vpslld  xmm7,xmm11,30
        vpor    xmm8,xmm8,xmm9
        vmovd   xmm9,DWORD[((-56))+r9]
        vpaddd  xmm14,xmm14,xmm6

        vpsrld  xmm11,xmm11,2
        vpaddd  xmm14,xmm14,xmm8
        vpshufb xmm1,xmm1,xmm5
        vpor    xmm11,xmm11,xmm7
        vpinsrd xmm2,xmm2,DWORD[((-56))+r10],1
        vpinsrd xmm9,xmm9,DWORD[((-56))+r11],1
        vpaddd  xmm13,xmm13,xmm15
        vpslld  xmm8,xmm14,5
        vpandn  xmm7,xmm10,xmm12
        vpand   xmm6,xmm10,xmm11

        vmovdqa XMMWORD[(16-128)+rax],xmm1
        vpaddd  xmm13,xmm13,xmm1
        vpunpckldq      xmm2,xmm2,xmm9
        vpsrld  xmm9,xmm14,27
        vpxor   xmm6,xmm6,xmm7
        vmovd   xmm3,DWORD[((-52))+r8]

        vpslld  xmm7,xmm10,30
        vpor    xmm8,xmm8,xmm9
        vmovd   xmm9,DWORD[((-52))+r9]
        vpaddd  xmm13,xmm13,xmm6

        vpsrld  xmm10,xmm10,2
        vpaddd  xmm13,xmm13,xmm8
        vpshufb xmm2,xmm2,xmm5
        vpor    xmm10,xmm10,xmm7
        vpinsrd xmm3,xmm3,DWORD[((-52))+r10],1
        vpinsrd xmm9,xmm9,DWORD[((-52))+r11],1
        vpaddd  xmm12,xmm12,xmm15
        vpslld  xmm8,xmm13,5
        vpandn  xmm7,xmm14,xmm11
        vpand   xmm6,xmm14,xmm10

        vmovdqa XMMWORD[(32-128)+rax],xmm2
        vpaddd  xmm12,xmm12,xmm2
        vpunpckldq      xmm3,xmm3,xmm9
        vpsrld  xmm9,xmm13,27
        vpxor   xmm6,xmm6,xmm7
        vmovd   xmm4,DWORD[((-48))+r8]

        vpslld  xmm7,xmm14,30
        vpor    xmm8,xmm8,xmm9
        vmovd   xmm9,DWORD[((-48))+r9]
        vpaddd  xmm12,xmm12,xmm6

        vpsrld  xmm14,xmm14,2
        vpaddd  xmm12,xmm12,xmm8
        vpshufb xmm3,xmm3,xmm5
        vpor    xmm14,xmm14,xmm7
        vpinsrd xmm4,xmm4,DWORD[((-48))+r10],1
        vpinsrd xmm9,xmm9,DWORD[((-48))+r11],1
        vpaddd  xmm11,xmm11,xmm15
        vpslld  xmm8,xmm12,5
        vpandn  xmm7,xmm13,xmm10
        vpand   xmm6,xmm13,xmm14

        vmovdqa XMMWORD[(48-128)+rax],xmm3
        vpaddd  xmm11,xmm11,xmm3
        vpunpckldq      xmm4,xmm4,xmm9
        vpsrld  xmm9,xmm12,27
        vpxor   xmm6,xmm6,xmm7
        vmovd   xmm0,DWORD[((-44))+r8]

        vpslld  xmm7,xmm13,30
        vpor    xmm8,xmm8,xmm9
        vmovd   xmm9,DWORD[((-44))+r9]
        vpaddd  xmm11,xmm11,xmm6

        vpsrld  xmm13,xmm13,2
        vpaddd  xmm11,xmm11,xmm8
        vpshufb xmm4,xmm4,xmm5
        vpor    xmm13,xmm13,xmm7
        vpinsrd xmm0,xmm0,DWORD[((-44))+r10],1
        vpinsrd xmm9,xmm9,DWORD[((-44))+r11],1
        vpaddd  xmm10,xmm10,xmm15
        vpslld  xmm8,xmm11,5
        vpandn  xmm7,xmm12,xmm14
        vpand   xmm6,xmm12,xmm13

        vmovdqa XMMWORD[(64-128)+rax],xmm4
        vpaddd  xmm10,xmm10,xmm4
        vpunpckldq      xmm0,xmm0,xmm9
        vpsrld  xmm9,xmm11,27
        vpxor   xmm6,xmm6,xmm7
        vmovd   xmm1,DWORD[((-40))+r8]

        vpslld  xmm7,xmm12,30
        vpor    xmm8,xmm8,xmm9
        vmovd   xmm9,DWORD[((-40))+r9]
        vpaddd  xmm10,xmm10,xmm6

        vpsrld  xmm12,xmm12,2
        vpaddd  xmm10,xmm10,xmm8
        vpshufb xmm0,xmm0,xmm5
        vpor    xmm12,xmm12,xmm7
        vpinsrd xmm1,xmm1,DWORD[((-40))+r10],1
        vpinsrd xmm9,xmm9,DWORD[((-40))+r11],1
        vpaddd  xmm14,xmm14,xmm15
        vpslld  xmm8,xmm10,5
        vpandn  xmm7,xmm11,xmm13
        vpand   xmm6,xmm11,xmm12

        vmovdqa XMMWORD[(80-128)+rax],xmm0
        vpaddd  xmm14,xmm14,xmm0
        vpunpckldq      xmm1,xmm1,xmm9
        vpsrld  xmm9,xmm10,27
        vpxor   xmm6,xmm6,xmm7
        vmovd   xmm2,DWORD[((-36))+r8]

        vpslld  xmm7,xmm11,30
        vpor    xmm8,xmm8,xmm9
        vmovd   xmm9,DWORD[((-36))+r9]
        vpaddd  xmm14,xmm14,xmm6

        vpsrld  xmm11,xmm11,2
        vpaddd  xmm14,xmm14,xmm8
        vpshufb xmm1,xmm1,xmm5
        vpor    xmm11,xmm11,xmm7
        vpinsrd xmm2,xmm2,DWORD[((-36))+r10],1
        vpinsrd xmm9,xmm9,DWORD[((-36))+r11],1
        vpaddd  xmm13,xmm13,xmm15
        vpslld  xmm8,xmm14,5
        vpandn  xmm7,xmm10,xmm12
        vpand   xmm6,xmm10,xmm11

        vmovdqa XMMWORD[(96-128)+rax],xmm1
        vpaddd  xmm13,xmm13,xmm1
        vpunpckldq      xmm2,xmm2,xmm9
        vpsrld  xmm9,xmm14,27
        vpxor   xmm6,xmm6,xmm7
        vmovd   xmm3,DWORD[((-32))+r8]

        vpslld  xmm7,xmm10,30
        vpor    xmm8,xmm8,xmm9
        vmovd   xmm9,DWORD[((-32))+r9]
        vpaddd  xmm13,xmm13,xmm6

        vpsrld  xmm10,xmm10,2
        vpaddd  xmm13,xmm13,xmm8
        vpshufb xmm2,xmm2,xmm5
        vpor    xmm10,xmm10,xmm7
        vpinsrd xmm3,xmm3,DWORD[((-32))+r10],1
        vpinsrd xmm9,xmm9,DWORD[((-32))+r11],1
        vpaddd  xmm12,xmm12,xmm15
        vpslld  xmm8,xmm13,5
        vpandn  xmm7,xmm14,xmm11
        vpand   xmm6,xmm14,xmm10

        vmovdqa XMMWORD[(112-128)+rax],xmm2
        vpaddd  xmm12,xmm12,xmm2
        vpunpckldq      xmm3,xmm3,xmm9
        vpsrld  xmm9,xmm13,27
        vpxor   xmm6,xmm6,xmm7
        vmovd   xmm4,DWORD[((-28))+r8]

        vpslld  xmm7,xmm14,30
        vpor    xmm8,xmm8,xmm9
        vmovd   xmm9,DWORD[((-28))+r9]
        vpaddd  xmm12,xmm12,xmm6

        vpsrld  xmm14,xmm14,2
        vpaddd  xmm12,xmm12,xmm8
        vpshufb xmm3,xmm3,xmm5
        vpor    xmm14,xmm14,xmm7
        vpinsrd xmm4,xmm4,DWORD[((-28))+r10],1
        vpinsrd xmm9,xmm9,DWORD[((-28))+r11],1
        vpaddd  xmm11,xmm11,xmm15
        vpslld  xmm8,xmm12,5
        vpandn  xmm7,xmm13,xmm10
        vpand   xmm6,xmm13,xmm14

        vmovdqa XMMWORD[(128-128)+rax],xmm3
        vpaddd  xmm11,xmm11,xmm3
        vpunpckldq      xmm4,xmm4,xmm9
        vpsrld  xmm9,xmm12,27
        vpxor   xmm6,xmm6,xmm7
        vmovd   xmm0,DWORD[((-24))+r8]

        vpslld  xmm7,xmm13,30
        vpor    xmm8,xmm8,xmm9
        vmovd   xmm9,DWORD[((-24))+r9]
        vpaddd  xmm11,xmm11,xmm6

        vpsrld  xmm13,xmm13,2
        vpaddd  xmm11,xmm11,xmm8
        vpshufb xmm4,xmm4,xmm5
        vpor    xmm13,xmm13,xmm7
        vpinsrd xmm0,xmm0,DWORD[((-24))+r10],1
        vpinsrd xmm9,xmm9,DWORD[((-24))+r11],1
        vpaddd  xmm10,xmm10,xmm15
        vpslld  xmm8,xmm11,5
        vpandn  xmm7,xmm12,xmm14
        vpand   xmm6,xmm12,xmm13

        vmovdqa XMMWORD[(144-128)+rax],xmm4
        vpaddd  xmm10,xmm10,xmm4
        vpunpckldq      xmm0,xmm0,xmm9
        vpsrld  xmm9,xmm11,27
        vpxor   xmm6,xmm6,xmm7
        vmovd   xmm1,DWORD[((-20))+r8]

        vpslld  xmm7,xmm12,30
        vpor    xmm8,xmm8,xmm9
        vmovd   xmm9,DWORD[((-20))+r9]
        vpaddd  xmm10,xmm10,xmm6

        vpsrld  xmm12,xmm12,2
        vpaddd  xmm10,xmm10,xmm8
        vpshufb xmm0,xmm0,xmm5
        vpor    xmm12,xmm12,xmm7
        vpinsrd xmm1,xmm1,DWORD[((-20))+r10],1
        vpinsrd xmm9,xmm9,DWORD[((-20))+r11],1
        vpaddd  xmm14,xmm14,xmm15
        vpslld  xmm8,xmm10,5
        vpandn  xmm7,xmm11,xmm13
        vpand   xmm6,xmm11,xmm12

        vmovdqa XMMWORD[(160-128)+rax],xmm0
        vpaddd  xmm14,xmm14,xmm0
        vpunpckldq      xmm1,xmm1,xmm9
        vpsrld  xmm9,xmm10,27
        vpxor   xmm6,xmm6,xmm7
        vmovd   xmm2,DWORD[((-16))+r8]

        vpslld  xmm7,xmm11,30
        vpor    xmm8,xmm8,xmm9
        vmovd   xmm9,DWORD[((-16))+r9]
        vpaddd  xmm14,xmm14,xmm6

        vpsrld  xmm11,xmm11,2
        vpaddd  xmm14,xmm14,xmm8
        vpshufb xmm1,xmm1,xmm5
        vpor    xmm11,xmm11,xmm7
        vpinsrd xmm2,xmm2,DWORD[((-16))+r10],1
        vpinsrd xmm9,xmm9,DWORD[((-16))+r11],1
        vpaddd  xmm13,xmm13,xmm15
        vpslld  xmm8,xmm14,5
        vpandn  xmm7,xmm10,xmm12
        vpand   xmm6,xmm10,xmm11

        vmovdqa XMMWORD[(176-128)+rax],xmm1
        vpaddd  xmm13,xmm13,xmm1
        vpunpckldq      xmm2,xmm2,xmm9
        vpsrld  xmm9,xmm14,27
        vpxor   xmm6,xmm6,xmm7
        vmovd   xmm3,DWORD[((-12))+r8]

        vpslld  xmm7,xmm10,30
        vpor    xmm8,xmm8,xmm9
        vmovd   xmm9,DWORD[((-12))+r9]
        vpaddd  xmm13,xmm13,xmm6

        vpsrld  xmm10,xmm10,2
        vpaddd  xmm13,xmm13,xmm8
        vpshufb xmm2,xmm2,xmm5
        vpor    xmm10,xmm10,xmm7
        vpinsrd xmm3,xmm3,DWORD[((-12))+r10],1
        vpinsrd xmm9,xmm9,DWORD[((-12))+r11],1
        vpaddd  xmm12,xmm12,xmm15
        vpslld  xmm8,xmm13,5
        vpandn  xmm7,xmm14,xmm11
        vpand   xmm6,xmm14,xmm10

        vmovdqa XMMWORD[(192-128)+rax],xmm2
        vpaddd  xmm12,xmm12,xmm2
        vpunpckldq      xmm3,xmm3,xmm9
        vpsrld  xmm9,xmm13,27
        vpxor   xmm6,xmm6,xmm7
        vmovd   xmm4,DWORD[((-8))+r8]

        vpslld  xmm7,xmm14,30
        vpor    xmm8,xmm8,xmm9
        vmovd   xmm9,DWORD[((-8))+r9]
        vpaddd  xmm12,xmm12,xmm6

        vpsrld  xmm14,xmm14,2
        vpaddd  xmm12,xmm12,xmm8
        vpshufb xmm3,xmm3,xmm5
        vpor    xmm14,xmm14,xmm7
        vpinsrd xmm4,xmm4,DWORD[((-8))+r10],1
        vpinsrd xmm9,xmm9,DWORD[((-8))+r11],1
        vpaddd  xmm11,xmm11,xmm15
        vpslld  xmm8,xmm12,5
        vpandn  xmm7,xmm13,xmm10
        vpand   xmm6,xmm13,xmm14

        vmovdqa XMMWORD[(208-128)+rax],xmm3
        vpaddd  xmm11,xmm11,xmm3
        vpunpckldq      xmm4,xmm4,xmm9
        vpsrld  xmm9,xmm12,27
        vpxor   xmm6,xmm6,xmm7
        vmovd   xmm0,DWORD[((-4))+r8]

        vpslld  xmm7,xmm13,30
        vpor    xmm8,xmm8,xmm9
        vmovd   xmm9,DWORD[((-4))+r9]
        vpaddd  xmm11,xmm11,xmm6

        vpsrld  xmm13,xmm13,2
        vpaddd  xmm11,xmm11,xmm8
        vpshufb xmm4,xmm4,xmm5
        vpor    xmm13,xmm13,xmm7
        vmovdqa xmm1,XMMWORD[((0-128))+rax]
        vpinsrd xmm0,xmm0,DWORD[((-4))+r10],1
        vpinsrd xmm9,xmm9,DWORD[((-4))+r11],1
        vpaddd  xmm10,xmm10,xmm15
        prefetcht0      [63+r8]
        vpslld  xmm8,xmm11,5
        vpandn  xmm7,xmm12,xmm14
        vpand   xmm6,xmm12,xmm13

        vmovdqa XMMWORD[(224-128)+rax],xmm4
        vpaddd  xmm10,xmm10,xmm4
        vpunpckldq      xmm0,xmm0,xmm9
        vpsrld  xmm9,xmm11,27
        prefetcht0      [63+r9]
        vpxor   xmm6,xmm6,xmm7

        vpslld  xmm7,xmm12,30
        vpor    xmm8,xmm8,xmm9
        prefetcht0      [63+r10]
        vpaddd  xmm10,xmm10,xmm6

        vpsrld  xmm12,xmm12,2
        vpaddd  xmm10,xmm10,xmm8
        prefetcht0      [63+r11]
        vpshufb xmm0,xmm0,xmm5
        vpor    xmm12,xmm12,xmm7
        vmovdqa xmm2,XMMWORD[((16-128))+rax]
        vpxor   xmm1,xmm1,xmm3
        vmovdqa xmm3,XMMWORD[((32-128))+rax]

        vpaddd  xmm14,xmm14,xmm15
        vpslld  xmm8,xmm10,5
        vpandn  xmm7,xmm11,xmm13

        vpand   xmm6,xmm11,xmm12

        vmovdqa XMMWORD[(240-128)+rax],xmm0
        vpaddd  xmm14,xmm14,xmm0
        vpxor   xmm1,xmm1,XMMWORD[((128-128))+rax]
        vpsrld  xmm9,xmm10,27
        vpxor   xmm6,xmm6,xmm7
        vpxor   xmm1,xmm1,xmm3


        vpslld  xmm7,xmm11,30
        vpor    xmm8,xmm8,xmm9
        vpaddd  xmm14,xmm14,xmm6

        vpsrld  xmm5,xmm1,31
        vpaddd  xmm1,xmm1,xmm1

        vpsrld  xmm11,xmm11,2

        vpaddd  xmm14,xmm14,xmm8
        vpor    xmm1,xmm1,xmm5
        vpor    xmm11,xmm11,xmm7
        vpxor   xmm2,xmm2,xmm4
        vmovdqa xmm4,XMMWORD[((48-128))+rax]

        vpaddd  xmm13,xmm13,xmm15
        vpslld  xmm8,xmm14,5
        vpandn  xmm7,xmm10,xmm12

        vpand   xmm6,xmm10,xmm11

        vmovdqa XMMWORD[(0-128)+rax],xmm1
        vpaddd  xmm13,xmm13,xmm1
        vpxor   xmm2,xmm2,XMMWORD[((144-128))+rax]
        vpsrld  xmm9,xmm14,27
        vpxor   xmm6,xmm6,xmm7
        vpxor   xmm2,xmm2,xmm4


        vpslld  xmm7,xmm10,30
        vpor    xmm8,xmm8,xmm9
        vpaddd  xmm13,xmm13,xmm6

        vpsrld  xmm5,xmm2,31
        vpaddd  xmm2,xmm2,xmm2

        vpsrld  xmm10,xmm10,2

        vpaddd  xmm13,xmm13,xmm8
        vpor    xmm2,xmm2,xmm5
        vpor    xmm10,xmm10,xmm7
        vpxor   xmm3,xmm3,xmm0
        vmovdqa xmm0,XMMWORD[((64-128))+rax]

        vpaddd  xmm12,xmm12,xmm15
        vpslld  xmm8,xmm13,5
        vpandn  xmm7,xmm14,xmm11

        vpand   xmm6,xmm14,xmm10

        vmovdqa XMMWORD[(16-128)+rax],xmm2
        vpaddd  xmm12,xmm12,xmm2
        vpxor   xmm3,xmm3,XMMWORD[((160-128))+rax]
        vpsrld  xmm9,xmm13,27
        vpxor   xmm6,xmm6,xmm7
        vpxor   xmm3,xmm3,xmm0


        vpslld  xmm7,xmm14,30
        vpor    xmm8,xmm8,xmm9
        vpaddd  xmm12,xmm12,xmm6

        vpsrld  xmm5,xmm3,31
        vpaddd  xmm3,xmm3,xmm3

        vpsrld  xmm14,xmm14,2

        vpaddd  xmm12,xmm12,xmm8
        vpor    xmm3,xmm3,xmm5
        vpor    xmm14,xmm14,xmm7
        vpxor   xmm4,xmm4,xmm1
        vmovdqa xmm1,XMMWORD[((80-128))+rax]

        vpaddd  xmm11,xmm11,xmm15
        vpslld  xmm8,xmm12,5
        vpandn  xmm7,xmm13,xmm10

        vpand   xmm6,xmm13,xmm14

        vmovdqa XMMWORD[(32-128)+rax],xmm3
        vpaddd  xmm11,xmm11,xmm3
        vpxor   xmm4,xmm4,XMMWORD[((176-128))+rax]
        vpsrld  xmm9,xmm12,27
        vpxor   xmm6,xmm6,xmm7
        vpxor   xmm4,xmm4,xmm1


        vpslld  xmm7,xmm13,30
        vpor    xmm8,xmm8,xmm9
        vpaddd  xmm11,xmm11,xmm6

        vpsrld  xmm5,xmm4,31
        vpaddd  xmm4,xmm4,xmm4

        vpsrld  xmm13,xmm13,2

        vpaddd  xmm11,xmm11,xmm8
        vpor    xmm4,xmm4,xmm5
        vpor    xmm13,xmm13,xmm7
        vpxor   xmm0,xmm0,xmm2
        vmovdqa xmm2,XMMWORD[((96-128))+rax]

        vpaddd  xmm10,xmm10,xmm15
        vpslld  xmm8,xmm11,5
        vpandn  xmm7,xmm12,xmm14

        vpand   xmm6,xmm12,xmm13

        vmovdqa XMMWORD[(48-128)+rax],xmm4
        vpaddd  xmm10,xmm10,xmm4
        vpxor   xmm0,xmm0,XMMWORD[((192-128))+rax]
        vpsrld  xmm9,xmm11,27
        vpxor   xmm6,xmm6,xmm7
        vpxor   xmm0,xmm0,xmm2


        vpslld  xmm7,xmm12,30
        vpor    xmm8,xmm8,xmm9
        vpaddd  xmm10,xmm10,xmm6

        vpsrld  xmm5,xmm0,31
        vpaddd  xmm0,xmm0,xmm0

        vpsrld  xmm12,xmm12,2

        vpaddd  xmm10,xmm10,xmm8
        vpor    xmm0,xmm0,xmm5
        vpor    xmm12,xmm12,xmm7
        ; ---- One unrolled round, three-way XOR form; reloads xmm15 -----------
        ; From here the mixing function changes from the and/andn select used
        ; above to xmm13 ^ xmm11 ^ xmm12, and xmm15 is reloaded from [rbp].
        ; NOTE(review): rbp presumably points at the K_XX_XX constant table (HEAD
        ; shows `lea rbp,[K_XX_XX]` on another code path) - confirm for this path.
        ; Roles in this round: xmm14 accumulator, xmm10 rotated left by 5,
        ; xmm11 rotated left by 30 in place, xmm0 word stored and added,
        ; xmm1 next word being built, xmm5..xmm9 scratch.
        vmovdqa xmm15,XMMWORD[rbp]                      ; new constant vector for the following rounds
        vpxor   xmm1,xmm1,xmm3                          ; next word: fold in old xmm3
        vmovdqa xmm3,XMMWORD[((112-128))+rax]           ; reload xmm3 from slot 112

        vpslld  xmm8,xmm10,5                            ; high part of rotl(xmm10,5)
        vpaddd  xmm14,xmm14,xmm15                       ; acc += constant
        vpxor   xmm6,xmm13,xmm11                        ; xmm13 ^ xmm11
        vmovdqa XMMWORD[(64-128)+rax],xmm0              ; save this round's word to slot 64
        vpaddd  xmm14,xmm14,xmm0                        ; acc += word
        vpxor   xmm1,xmm1,XMMWORD[((208-128))+rax]      ; next word ^= slot 208
        vpsrld  xmm9,xmm10,27                           ; low part of rotl(xmm10,5)
        vpxor   xmm6,xmm6,xmm12                         ; xmm13 ^ xmm11 ^ xmm12
        vpxor   xmm1,xmm1,xmm3                          ; next word ^= freshly loaded slot 112

        vpslld  xmm7,xmm11,30                           ; high part of rotl(xmm11,30)
        vpor    xmm8,xmm8,xmm9                          ; xmm8 = rotl(xmm10,5)
        vpaddd  xmm14,xmm14,xmm6                        ; acc += three-way XOR
        vpsrld  xmm5,xmm1,31                            ; bit shifted out by the doubling below
        vpaddd  xmm1,xmm1,xmm1                          ; x+x == x<<1

        vpsrld  xmm11,xmm11,2                           ; low part of rotl(xmm11,30)
        vpaddd  xmm14,xmm14,xmm8                        ; acc += rotl(xmm10,5)
        vpor    xmm1,xmm1,xmm5                          ; xmm1 = rotl(next word,1)
        vpor    xmm11,xmm11,xmm7                        ; xmm11 = rotl(xmm11,30)
        vpxor   xmm2,xmm2,xmm4
        vmovdqa xmm4,XMMWORD[((128-128))+rax]

        vpslld  xmm8,xmm14,5
        vpaddd  xmm13,xmm13,xmm15
        vpxor   xmm6,xmm12,xmm10
        vmovdqa XMMWORD[(80-128)+rax],xmm1
        vpaddd  xmm13,xmm13,xmm1
        vpxor   xmm2,xmm2,XMMWORD[((224-128))+rax]
        vpsrld  xmm9,xmm14,27
        vpxor   xmm6,xmm6,xmm11
        vpxor   xmm2,xmm2,xmm4

        vpslld  xmm7,xmm10,30
        vpor    xmm8,xmm8,xmm9
        vpaddd  xmm13,xmm13,xmm6
        vpsrld  xmm5,xmm2,31
        vpaddd  xmm2,xmm2,xmm2

        vpsrld  xmm10,xmm10,2
        vpaddd  xmm13,xmm13,xmm8
        vpor    xmm2,xmm2,xmm5
        vpor    xmm10,xmm10,xmm7
        vpxor   xmm3,xmm3,xmm0
        vmovdqa xmm0,XMMWORD[((144-128))+rax]

        vpslld  xmm8,xmm13,5
        vpaddd  xmm12,xmm12,xmm15
        vpxor   xmm6,xmm11,xmm14
        vmovdqa XMMWORD[(96-128)+rax],xmm2
        vpaddd  xmm12,xmm12,xmm2
        vpxor   xmm3,xmm3,XMMWORD[((240-128))+rax]
        vpsrld  xmm9,xmm13,27
        vpxor   xmm6,xmm6,xmm10
        vpxor   xmm3,xmm3,xmm0

        vpslld  xmm7,xmm14,30
        vpor    xmm8,xmm8,xmm9
        vpaddd  xmm12,xmm12,xmm6
        vpsrld  xmm5,xmm3,31
        vpaddd  xmm3,xmm3,xmm3

        vpsrld  xmm14,xmm14,2
        vpaddd  xmm12,xmm12,xmm8
        vpor    xmm3,xmm3,xmm5
        vpor    xmm14,xmm14,xmm7
        vpxor   xmm4,xmm4,xmm1
        vmovdqa xmm1,XMMWORD[((160-128))+rax]

        vpslld  xmm8,xmm12,5
        vpaddd  xmm11,xmm11,xmm15
        vpxor   xmm6,xmm10,xmm13
        vmovdqa XMMWORD[(112-128)+rax],xmm3
        vpaddd  xmm11,xmm11,xmm3
        vpxor   xmm4,xmm4,XMMWORD[((0-128))+rax]
        vpsrld  xmm9,xmm12,27
        vpxor   xmm6,xmm6,xmm14
        vpxor   xmm4,xmm4,xmm1

        vpslld  xmm7,xmm13,30
        vpor    xmm8,xmm8,xmm9
        vpaddd  xmm11,xmm11,xmm6
        vpsrld  xmm5,xmm4,31
        vpaddd  xmm4,xmm4,xmm4

        vpsrld  xmm13,xmm13,2
        vpaddd  xmm11,xmm11,xmm8
        vpor    xmm4,xmm4,xmm5
        vpor    xmm13,xmm13,xmm7
        vpxor   xmm0,xmm0,xmm2
        vmovdqa xmm2,XMMWORD[((176-128))+rax]

        vpslld  xmm8,xmm11,5
        vpaddd  xmm10,xmm10,xmm15
        vpxor   xmm6,xmm14,xmm12
        vmovdqa XMMWORD[(128-128)+rax],xmm4
        vpaddd  xmm10,xmm10,xmm4
        vpxor   xmm0,xmm0,XMMWORD[((16-128))+rax]
        vpsrld  xmm9,xmm11,27
        vpxor   xmm6,xmm6,xmm13
        vpxor   xmm0,xmm0,xmm2

        vpslld  xmm7,xmm12,30
        vpor    xmm8,xmm8,xmm9
        vpaddd  xmm10,xmm10,xmm6
        vpsrld  xmm5,xmm0,31
        vpaddd  xmm0,xmm0,xmm0

        vpsrld  xmm12,xmm12,2
        vpaddd  xmm10,xmm10,xmm8
        vpor    xmm0,xmm0,xmm5
        vpor    xmm12,xmm12,xmm7
        vpxor   xmm1,xmm1,xmm3
        vmovdqa xmm3,XMMWORD[((192-128))+rax]

        vpslld  xmm8,xmm10,5
        vpaddd  xmm14,xmm14,xmm15
        vpxor   xmm6,xmm13,xmm11
        vmovdqa XMMWORD[(144-128)+rax],xmm0
        vpaddd  xmm14,xmm14,xmm0
        vpxor   xmm1,xmm1,XMMWORD[((32-128))+rax]
        vpsrld  xmm9,xmm10,27
        vpxor   xmm6,xmm6,xmm12
        vpxor   xmm1,xmm1,xmm3

        vpslld  xmm7,xmm11,30
        vpor    xmm8,xmm8,xmm9
        vpaddd  xmm14,xmm14,xmm6
        vpsrld  xmm5,xmm1,31
        vpaddd  xmm1,xmm1,xmm1

        vpsrld  xmm11,xmm11,2
        vpaddd  xmm14,xmm14,xmm8
        vpor    xmm1,xmm1,xmm5
        vpor    xmm11,xmm11,xmm7
        vpxor   xmm2,xmm2,xmm4
        vmovdqa xmm4,XMMWORD[((208-128))+rax]

        vpslld  xmm8,xmm14,5
        vpaddd  xmm13,xmm13,xmm15
        vpxor   xmm6,xmm12,xmm10
        vmovdqa XMMWORD[(160-128)+rax],xmm1
        vpaddd  xmm13,xmm13,xmm1
        vpxor   xmm2,xmm2,XMMWORD[((48-128))+rax]
        vpsrld  xmm9,xmm14,27
        vpxor   xmm6,xmm6,xmm11
        vpxor   xmm2,xmm2,xmm4

        vpslld  xmm7,xmm10,30
        vpor    xmm8,xmm8,xmm9
        vpaddd  xmm13,xmm13,xmm6
        vpsrld  xmm5,xmm2,31
        vpaddd  xmm2,xmm2,xmm2

        vpsrld  xmm10,xmm10,2
        vpaddd  xmm13,xmm13,xmm8
        vpor    xmm2,xmm2,xmm5
        vpor    xmm10,xmm10,xmm7
        vpxor   xmm3,xmm3,xmm0
        vmovdqa xmm0,XMMWORD[((224-128))+rax]

        vpslld  xmm8,xmm13,5
        vpaddd  xmm12,xmm12,xmm15
        vpxor   xmm6,xmm11,xmm14
        vmovdqa XMMWORD[(176-128)+rax],xmm2
        vpaddd  xmm12,xmm12,xmm2
        vpxor   xmm3,xmm3,XMMWORD[((64-128))+rax]
        vpsrld  xmm9,xmm13,27
        vpxor   xmm6,xmm6,xmm10
        vpxor   xmm3,xmm3,xmm0

        vpslld  xmm7,xmm14,30
        vpor    xmm8,xmm8,xmm9
        vpaddd  xmm12,xmm12,xmm6
        vpsrld  xmm5,xmm3,31
        vpaddd  xmm3,xmm3,xmm3

        vpsrld  xmm14,xmm14,2
        vpaddd  xmm12,xmm12,xmm8
        vpor    xmm3,xmm3,xmm5
        vpor    xmm14,xmm14,xmm7
        vpxor   xmm4,xmm4,xmm1
        vmovdqa xmm1,XMMWORD[((240-128))+rax]

        vpslld  xmm8,xmm12,5
        vpaddd  xmm11,xmm11,xmm15
        vpxor   xmm6,xmm10,xmm13
        vmovdqa XMMWORD[(192-128)+rax],xmm3
        vpaddd  xmm11,xmm11,xmm3
        vpxor   xmm4,xmm4,XMMWORD[((80-128))+rax]
        vpsrld  xmm9,xmm12,27
        vpxor   xmm6,xmm6,xmm14
        vpxor   xmm4,xmm4,xmm1

        vpslld  xmm7,xmm13,30
        vpor    xmm8,xmm8,xmm9
        vpaddd  xmm11,xmm11,xmm6
        vpsrld  xmm5,xmm4,31
        vpaddd  xmm4,xmm4,xmm4

        vpsrld  xmm13,xmm13,2
        vpaddd  xmm11,xmm11,xmm8
        vpor    xmm4,xmm4,xmm5
        vpor    xmm13,xmm13,xmm7
        vpxor   xmm0,xmm0,xmm2
        vmovdqa xmm2,XMMWORD[((0-128))+rax]

        vpslld  xmm8,xmm11,5
        vpaddd  xmm10,xmm10,xmm15
        vpxor   xmm6,xmm14,xmm12
        vmovdqa XMMWORD[(208-128)+rax],xmm4
        vpaddd  xmm10,xmm10,xmm4
        vpxor   xmm0,xmm0,XMMWORD[((96-128))+rax]
        vpsrld  xmm9,xmm11,27
        vpxor   xmm6,xmm6,xmm13
        vpxor   xmm0,xmm0,xmm2

        vpslld  xmm7,xmm12,30
        vpor    xmm8,xmm8,xmm9
        vpaddd  xmm10,xmm10,xmm6
        vpsrld  xmm5,xmm0,31
        vpaddd  xmm0,xmm0,xmm0

        vpsrld  xmm12,xmm12,2
        vpaddd  xmm10,xmm10,xmm8
        vpor    xmm0,xmm0,xmm5
        vpor    xmm12,xmm12,xmm7
        vpxor   xmm1,xmm1,xmm3
        vmovdqa xmm3,XMMWORD[((16-128))+rax]

        vpslld  xmm8,xmm10,5
        vpaddd  xmm14,xmm14,xmm15
        vpxor   xmm6,xmm13,xmm11
        vmovdqa XMMWORD[(224-128)+rax],xmm0
        vpaddd  xmm14,xmm14,xmm0
        vpxor   xmm1,xmm1,XMMWORD[((112-128))+rax]
        vpsrld  xmm9,xmm10,27
        vpxor   xmm6,xmm6,xmm12
        vpxor   xmm1,xmm1,xmm3

        vpslld  xmm7,xmm11,30
        vpor    xmm8,xmm8,xmm9
        vpaddd  xmm14,xmm14,xmm6
        vpsrld  xmm5,xmm1,31
        vpaddd  xmm1,xmm1,xmm1

        vpsrld  xmm11,xmm11,2
        vpaddd  xmm14,xmm14,xmm8
        vpor    xmm1,xmm1,xmm5
        vpor    xmm11,xmm11,xmm7
        vpxor   xmm2,xmm2,xmm4
        vmovdqa xmm4,XMMWORD[((32-128))+rax]

        vpslld  xmm8,xmm14,5
        vpaddd  xmm13,xmm13,xmm15
        vpxor   xmm6,xmm12,xmm10
        vmovdqa XMMWORD[(240-128)+rax],xmm1
        vpaddd  xmm13,xmm13,xmm1
        vpxor   xmm2,xmm2,XMMWORD[((128-128))+rax]
        vpsrld  xmm9,xmm14,27
        vpxor   xmm6,xmm6,xmm11
        vpxor   xmm2,xmm2,xmm4

        vpslld  xmm7,xmm10,30
        vpor    xmm8,xmm8,xmm9
        vpaddd  xmm13,xmm13,xmm6
        vpsrld  xmm5,xmm2,31
        vpaddd  xmm2,xmm2,xmm2

        vpsrld  xmm10,xmm10,2
        vpaddd  xmm13,xmm13,xmm8
        vpor    xmm2,xmm2,xmm5
        vpor    xmm10,xmm10,xmm7
        vpxor   xmm3,xmm3,xmm0
        vmovdqa xmm0,XMMWORD[((48-128))+rax]

        vpslld  xmm8,xmm13,5
        vpaddd  xmm12,xmm12,xmm15
        vpxor   xmm6,xmm11,xmm14
        vmovdqa XMMWORD[(0-128)+rax],xmm2
        vpaddd  xmm12,xmm12,xmm2
        vpxor   xmm3,xmm3,XMMWORD[((144-128))+rax]
        vpsrld  xmm9,xmm13,27
        vpxor   xmm6,xmm6,xmm10
        vpxor   xmm3,xmm3,xmm0

        vpslld  xmm7,xmm14,30
        vpor    xmm8,xmm8,xmm9
        vpaddd  xmm12,xmm12,xmm6
        vpsrld  xmm5,xmm3,31
        vpaddd  xmm3,xmm3,xmm3

        vpsrld  xmm14,xmm14,2
        vpaddd  xmm12,xmm12,xmm8
        vpor    xmm3,xmm3,xmm5
        vpor    xmm14,xmm14,xmm7
        vpxor   xmm4,xmm4,xmm1
        vmovdqa xmm1,XMMWORD[((64-128))+rax]

        vpslld  xmm8,xmm12,5
        vpaddd  xmm11,xmm11,xmm15
        vpxor   xmm6,xmm10,xmm13
        vmovdqa XMMWORD[(16-128)+rax],xmm3
        vpaddd  xmm11,xmm11,xmm3
        vpxor   xmm4,xmm4,XMMWORD[((160-128))+rax]
        vpsrld  xmm9,xmm12,27
        vpxor   xmm6,xmm6,xmm14
        vpxor   xmm4,xmm4,xmm1

        vpslld  xmm7,xmm13,30
        vpor    xmm8,xmm8,xmm9
        vpaddd  xmm11,xmm11,xmm6
        vpsrld  xmm5,xmm4,31
        vpaddd  xmm4,xmm4,xmm4

        vpsrld  xmm13,xmm13,2
        vpaddd  xmm11,xmm11,xmm8
        vpor    xmm4,xmm4,xmm5
        vpor    xmm13,xmm13,xmm7
        vpxor   xmm0,xmm0,xmm2
        vmovdqa xmm2,XMMWORD[((80-128))+rax]

        vpslld  xmm8,xmm11,5
        vpaddd  xmm10,xmm10,xmm15
        vpxor   xmm6,xmm14,xmm12
        vmovdqa XMMWORD[(32-128)+rax],xmm4
        vpaddd  xmm10,xmm10,xmm4
        vpxor   xmm0,xmm0,XMMWORD[((176-128))+rax]
        vpsrld  xmm9,xmm11,27
        vpxor   xmm6,xmm6,xmm13
        vpxor   xmm0,xmm0,xmm2

        vpslld  xmm7,xmm12,30
        vpor    xmm8,xmm8,xmm9
        vpaddd  xmm10,xmm10,xmm6
        vpsrld  xmm5,xmm0,31
        vpaddd  xmm0,xmm0,xmm0

        vpsrld  xmm12,xmm12,2
        vpaddd  xmm10,xmm10,xmm8
        vpor    xmm0,xmm0,xmm5
        vpor    xmm12,xmm12,xmm7
        vpxor   xmm1,xmm1,xmm3
        vmovdqa xmm3,XMMWORD[((96-128))+rax]

        vpslld  xmm8,xmm10,5
        vpaddd  xmm14,xmm14,xmm15
        vpxor   xmm6,xmm13,xmm11
        vmovdqa XMMWORD[(48-128)+rax],xmm0
        vpaddd  xmm14,xmm14,xmm0
        vpxor   xmm1,xmm1,XMMWORD[((192-128))+rax]
        vpsrld  xmm9,xmm10,27
        vpxor   xmm6,xmm6,xmm12
        vpxor   xmm1,xmm1,xmm3

        vpslld  xmm7,xmm11,30
        vpor    xmm8,xmm8,xmm9
        vpaddd  xmm14,xmm14,xmm6
        vpsrld  xmm5,xmm1,31
        vpaddd  xmm1,xmm1,xmm1

        vpsrld  xmm11,xmm11,2
        vpaddd  xmm14,xmm14,xmm8
        vpor    xmm1,xmm1,xmm5
        vpor    xmm11,xmm11,xmm7
        vpxor   xmm2,xmm2,xmm4
        vmovdqa xmm4,XMMWORD[((112-128))+rax]

        vpslld  xmm8,xmm14,5
        vpaddd  xmm13,xmm13,xmm15
        vpxor   xmm6,xmm12,xmm10
        vmovdqa XMMWORD[(64-128)+rax],xmm1
        vpaddd  xmm13,xmm13,xmm1
        vpxor   xmm2,xmm2,XMMWORD[((208-128))+rax]
        vpsrld  xmm9,xmm14,27
        vpxor   xmm6,xmm6,xmm11
        vpxor   xmm2,xmm2,xmm4

        vpslld  xmm7,xmm10,30
        vpor    xmm8,xmm8,xmm9
        vpaddd  xmm13,xmm13,xmm6
        vpsrld  xmm5,xmm2,31
        vpaddd  xmm2,xmm2,xmm2

        vpsrld  xmm10,xmm10,2
        vpaddd  xmm13,xmm13,xmm8
        vpor    xmm2,xmm2,xmm5
        vpor    xmm10,xmm10,xmm7
        vpxor   xmm3,xmm3,xmm0
        vmovdqa xmm0,XMMWORD[((128-128))+rax]

        vpslld  xmm8,xmm13,5
        vpaddd  xmm12,xmm12,xmm15
        vpxor   xmm6,xmm11,xmm14
        vmovdqa XMMWORD[(80-128)+rax],xmm2
        vpaddd  xmm12,xmm12,xmm2
        vpxor   xmm3,xmm3,XMMWORD[((224-128))+rax]
        vpsrld  xmm9,xmm13,27
        vpxor   xmm6,xmm6,xmm10
        vpxor   xmm3,xmm3,xmm0

        vpslld  xmm7,xmm14,30
        vpor    xmm8,xmm8,xmm9
        vpaddd  xmm12,xmm12,xmm6
        vpsrld  xmm5,xmm3,31
        vpaddd  xmm3,xmm3,xmm3

        vpsrld  xmm14,xmm14,2
        vpaddd  xmm12,xmm12,xmm8
        vpor    xmm3,xmm3,xmm5
        vpor    xmm14,xmm14,xmm7
        vpxor   xmm4,xmm4,xmm1
        vmovdqa xmm1,XMMWORD[((144-128))+rax]

        vpslld  xmm8,xmm12,5
        vpaddd  xmm11,xmm11,xmm15
        vpxor   xmm6,xmm10,xmm13
        vmovdqa XMMWORD[(96-128)+rax],xmm3
        vpaddd  xmm11,xmm11,xmm3
        vpxor   xmm4,xmm4,XMMWORD[((240-128))+rax]
        vpsrld  xmm9,xmm12,27
        vpxor   xmm6,xmm6,xmm14
        vpxor   xmm4,xmm4,xmm1

        vpslld  xmm7,xmm13,30
        vpor    xmm8,xmm8,xmm9
        vpaddd  xmm11,xmm11,xmm6
        vpsrld  xmm5,xmm4,31
        vpaddd  xmm4,xmm4,xmm4

        vpsrld  xmm13,xmm13,2
        vpaddd  xmm11,xmm11,xmm8
        vpor    xmm4,xmm4,xmm5
        vpor    xmm13,xmm13,xmm7
        vpxor   xmm0,xmm0,xmm2
        vmovdqa xmm2,XMMWORD[((160-128))+rax]

        vpslld  xmm8,xmm11,5
        vpaddd  xmm10,xmm10,xmm15
        vpxor   xmm6,xmm14,xmm12
        vmovdqa XMMWORD[(112-128)+rax],xmm4
        vpaddd  xmm10,xmm10,xmm4
        vpxor   xmm0,xmm0,XMMWORD[((0-128))+rax]
        vpsrld  xmm9,xmm11,27
        vpxor   xmm6,xmm6,xmm13
        vpxor   xmm0,xmm0,xmm2

        vpslld  xmm7,xmm12,30
        vpor    xmm8,xmm8,xmm9
        vpaddd  xmm10,xmm10,xmm6
        vpsrld  xmm5,xmm0,31
        vpaddd  xmm0,xmm0,xmm0

        vpsrld  xmm12,xmm12,2
        vpaddd  xmm10,xmm10,xmm8
        vpor    xmm0,xmm0,xmm5
        vpor    xmm12,xmm12,xmm7
        ; ---- One unrolled round, and/xor (majority) form; reloads xmm15 ------
        ; The mixing function here is accumulated as two separate addends:
        ;   (xmm13 & xmm12)  and  ((xmm13 ^ xmm12) & xmm11).
        ; The two terms can never have the same bit set, so adding them one
        ; after the other equals OR-ing them, i.e. the bitwise majority of
        ; xmm11, xmm12 and xmm13.
        ; Roles in this round: xmm14 accumulator, xmm10 rotated left by 5,
        ; xmm11 rotated left by 30 in place, xmm0 word stored and added,
        ; xmm1 next word being built, xmm5..xmm9 scratch.
        ; NOTE(review): stores in this form use vmovdqu while the other round
        ; forms use vmovdqa on the same rax-based area; reason not visible here.
        vmovdqa xmm15,XMMWORD[32+rbp]                   ; next constant vector, 32 bytes past the previous one
        vpxor   xmm1,xmm1,xmm3                          ; next word: fold in old xmm3
        vmovdqa xmm3,XMMWORD[((176-128))+rax]           ; reload xmm3 from slot 176

        vpaddd  xmm14,xmm14,xmm15                       ; acc += constant
        vpslld  xmm8,xmm10,5                            ; high part of rotl(xmm10,5)
        vpand   xmm7,xmm13,xmm12                        ; first majority term: xmm13 & xmm12
        vpxor   xmm1,xmm1,XMMWORD[((16-128))+rax]       ; next word ^= slot 16

        vpaddd  xmm14,xmm14,xmm7                        ; acc += first term
        vpsrld  xmm9,xmm10,27                           ; low part of rotl(xmm10,5)
        vpxor   xmm6,xmm13,xmm12                        ; xmm13 ^ xmm12
        vpxor   xmm1,xmm1,xmm3                          ; next word ^= freshly loaded slot 176

        vmovdqu XMMWORD[(128-128)+rax],xmm0             ; save this round's word to slot 128
        vpaddd  xmm14,xmm14,xmm0                        ; acc += word
        vpor    xmm8,xmm8,xmm9                          ; xmm8 = rotl(xmm10,5)
        vpsrld  xmm5,xmm1,31                            ; bit shifted out by the doubling below
        vpand   xmm6,xmm6,xmm11                         ; second term: (xmm13 ^ xmm12) & xmm11
        vpaddd  xmm1,xmm1,xmm1                          ; x+x == x<<1

        vpslld  xmm7,xmm11,30                           ; high part of rotl(xmm11,30)
        vpaddd  xmm14,xmm14,xmm6                        ; acc += second term

        vpsrld  xmm11,xmm11,2                           ; low part of rotl(xmm11,30)
        vpaddd  xmm14,xmm14,xmm8                        ; acc += rotl(xmm10,5)
        vpor    xmm1,xmm1,xmm5                          ; xmm1 = rotl(next word,1)
        vpor    xmm11,xmm11,xmm7                        ; xmm11 = rotl(xmm11,30)
        vpxor   xmm2,xmm2,xmm4
        vmovdqa xmm4,XMMWORD[((192-128))+rax]

        vpaddd  xmm13,xmm13,xmm15
        vpslld  xmm8,xmm14,5
        vpand   xmm7,xmm12,xmm11
        vpxor   xmm2,xmm2,XMMWORD[((32-128))+rax]

        vpaddd  xmm13,xmm13,xmm7
        vpsrld  xmm9,xmm14,27
        vpxor   xmm6,xmm12,xmm11
        vpxor   xmm2,xmm2,xmm4

        vmovdqu XMMWORD[(144-128)+rax],xmm1
        vpaddd  xmm13,xmm13,xmm1
        vpor    xmm8,xmm8,xmm9
        vpsrld  xmm5,xmm2,31
        vpand   xmm6,xmm6,xmm10
        vpaddd  xmm2,xmm2,xmm2

        vpslld  xmm7,xmm10,30
        vpaddd  xmm13,xmm13,xmm6

        vpsrld  xmm10,xmm10,2
        vpaddd  xmm13,xmm13,xmm8
        vpor    xmm2,xmm2,xmm5
        vpor    xmm10,xmm10,xmm7
        vpxor   xmm3,xmm3,xmm0
        vmovdqa xmm0,XMMWORD[((208-128))+rax]

        vpaddd  xmm12,xmm12,xmm15
        vpslld  xmm8,xmm13,5
        vpand   xmm7,xmm11,xmm10
        vpxor   xmm3,xmm3,XMMWORD[((48-128))+rax]

        vpaddd  xmm12,xmm12,xmm7
        vpsrld  xmm9,xmm13,27
        vpxor   xmm6,xmm11,xmm10
        vpxor   xmm3,xmm3,xmm0

        vmovdqu XMMWORD[(160-128)+rax],xmm2
        vpaddd  xmm12,xmm12,xmm2
        vpor    xmm8,xmm8,xmm9
        vpsrld  xmm5,xmm3,31
        vpand   xmm6,xmm6,xmm14
        vpaddd  xmm3,xmm3,xmm3

        vpslld  xmm7,xmm14,30
        vpaddd  xmm12,xmm12,xmm6

        vpsrld  xmm14,xmm14,2
        vpaddd  xmm12,xmm12,xmm8
        vpor    xmm3,xmm3,xmm5
        vpor    xmm14,xmm14,xmm7
        vpxor   xmm4,xmm4,xmm1
        vmovdqa xmm1,XMMWORD[((224-128))+rax]

        vpaddd  xmm11,xmm11,xmm15
        vpslld  xmm8,xmm12,5
        vpand   xmm7,xmm10,xmm14
        vpxor   xmm4,xmm4,XMMWORD[((64-128))+rax]

        vpaddd  xmm11,xmm11,xmm7
        vpsrld  xmm9,xmm12,27
        vpxor   xmm6,xmm10,xmm14
        vpxor   xmm4,xmm4,xmm1

        vmovdqu XMMWORD[(176-128)+rax],xmm3
        vpaddd  xmm11,xmm11,xmm3
        vpor    xmm8,xmm8,xmm9
        vpsrld  xmm5,xmm4,31
        vpand   xmm6,xmm6,xmm13
        vpaddd  xmm4,xmm4,xmm4

        vpslld  xmm7,xmm13,30
        vpaddd  xmm11,xmm11,xmm6

        vpsrld  xmm13,xmm13,2
        vpaddd  xmm11,xmm11,xmm8
        vpor    xmm4,xmm4,xmm5
        vpor    xmm13,xmm13,xmm7
        vpxor   xmm0,xmm0,xmm2
        vmovdqa xmm2,XMMWORD[((240-128))+rax]

        vpaddd  xmm10,xmm10,xmm15
        vpslld  xmm8,xmm11,5
        vpand   xmm7,xmm14,xmm13
        vpxor   xmm0,xmm0,XMMWORD[((80-128))+rax]

        vpaddd  xmm10,xmm10,xmm7
        vpsrld  xmm9,xmm11,27
        vpxor   xmm6,xmm14,xmm13
        vpxor   xmm0,xmm0,xmm2

        vmovdqu XMMWORD[(192-128)+rax],xmm4
        vpaddd  xmm10,xmm10,xmm4
        vpor    xmm8,xmm8,xmm9
        vpsrld  xmm5,xmm0,31
        vpand   xmm6,xmm6,xmm12
        vpaddd  xmm0,xmm0,xmm0

        vpslld  xmm7,xmm12,30
        vpaddd  xmm10,xmm10,xmm6

        vpsrld  xmm12,xmm12,2
        vpaddd  xmm10,xmm10,xmm8
        vpor    xmm0,xmm0,xmm5
        vpor    xmm12,xmm12,xmm7
        vpxor   xmm1,xmm1,xmm3
        vmovdqa xmm3,XMMWORD[((0-128))+rax]

        vpaddd  xmm14,xmm14,xmm15
        vpslld  xmm8,xmm10,5
        vpand   xmm7,xmm13,xmm12
        vpxor   xmm1,xmm1,XMMWORD[((96-128))+rax]

        vpaddd  xmm14,xmm14,xmm7
        vpsrld  xmm9,xmm10,27
        vpxor   xmm6,xmm13,xmm12
        vpxor   xmm1,xmm1,xmm3

        vmovdqu XMMWORD[(208-128)+rax],xmm0
        vpaddd  xmm14,xmm14,xmm0
        vpor    xmm8,xmm8,xmm9
        vpsrld  xmm5,xmm1,31
        vpand   xmm6,xmm6,xmm11
        vpaddd  xmm1,xmm1,xmm1

        vpslld  xmm7,xmm11,30
        vpaddd  xmm14,xmm14,xmm6

        vpsrld  xmm11,xmm11,2
        vpaddd  xmm14,xmm14,xmm8
        vpor    xmm1,xmm1,xmm5
        vpor    xmm11,xmm11,xmm7
        vpxor   xmm2,xmm2,xmm4
        vmovdqa xmm4,XMMWORD[((16-128))+rax]

        vpaddd  xmm13,xmm13,xmm15
        vpslld  xmm8,xmm14,5
        vpand   xmm7,xmm12,xmm11
        vpxor   xmm2,xmm2,XMMWORD[((112-128))+rax]

        vpaddd  xmm13,xmm13,xmm7
        vpsrld  xmm9,xmm14,27
        vpxor   xmm6,xmm12,xmm11
        vpxor   xmm2,xmm2,xmm4

        vmovdqu XMMWORD[(224-128)+rax],xmm1
        vpaddd  xmm13,xmm13,xmm1
        vpor    xmm8,xmm8,xmm9
        vpsrld  xmm5,xmm2,31
        vpand   xmm6,xmm6,xmm10
        vpaddd  xmm2,xmm2,xmm2

        vpslld  xmm7,xmm10,30
        vpaddd  xmm13,xmm13,xmm6

        vpsrld  xmm10,xmm10,2
        vpaddd  xmm13,xmm13,xmm8
        vpor    xmm2,xmm2,xmm5
        vpor    xmm10,xmm10,xmm7
        vpxor   xmm3,xmm3,xmm0
        vmovdqa xmm0,XMMWORD[((32-128))+rax]

        vpaddd  xmm12,xmm12,xmm15
        vpslld  xmm8,xmm13,5
        vpand   xmm7,xmm11,xmm10
        vpxor   xmm3,xmm3,XMMWORD[((128-128))+rax]

        vpaddd  xmm12,xmm12,xmm7
        vpsrld  xmm9,xmm13,27
        vpxor   xmm6,xmm11,xmm10
        vpxor   xmm3,xmm3,xmm0

        vmovdqu XMMWORD[(240-128)+rax],xmm2
        vpaddd  xmm12,xmm12,xmm2
        vpor    xmm8,xmm8,xmm9
        vpsrld  xmm5,xmm3,31
        vpand   xmm6,xmm6,xmm14
        vpaddd  xmm3,xmm3,xmm3

        vpslld  xmm7,xmm14,30
        vpaddd  xmm12,xmm12,xmm6

        vpsrld  xmm14,xmm14,2
        vpaddd  xmm12,xmm12,xmm8
        vpor    xmm3,xmm3,xmm5
        vpor    xmm14,xmm14,xmm7
        vpxor   xmm4,xmm4,xmm1
        vmovdqa xmm1,XMMWORD[((48-128))+rax]

        vpaddd  xmm11,xmm11,xmm15
        vpslld  xmm8,xmm12,5
        vpand   xmm7,xmm10,xmm14
        vpxor   xmm4,xmm4,XMMWORD[((144-128))+rax]

        vpaddd  xmm11,xmm11,xmm7
        vpsrld  xmm9,xmm12,27
        vpxor   xmm6,xmm10,xmm14
        vpxor   xmm4,xmm4,xmm1

        vmovdqu XMMWORD[(0-128)+rax],xmm3
        vpaddd  xmm11,xmm11,xmm3
        vpor    xmm8,xmm8,xmm9
        vpsrld  xmm5,xmm4,31
        vpand   xmm6,xmm6,xmm13
        vpaddd  xmm4,xmm4,xmm4

        vpslld  xmm7,xmm13,30
        vpaddd  xmm11,xmm11,xmm6

        vpsrld  xmm13,xmm13,2
        vpaddd  xmm11,xmm11,xmm8
        vpor    xmm4,xmm4,xmm5
        vpor    xmm13,xmm13,xmm7
        vpxor   xmm0,xmm0,xmm2
        vmovdqa xmm2,XMMWORD[((64-128))+rax]

        vpaddd  xmm10,xmm10,xmm15
        vpslld  xmm8,xmm11,5
        vpand   xmm7,xmm14,xmm13
        vpxor   xmm0,xmm0,XMMWORD[((160-128))+rax]

        vpaddd  xmm10,xmm10,xmm7
        vpsrld  xmm9,xmm11,27
        vpxor   xmm6,xmm14,xmm13
        vpxor   xmm0,xmm0,xmm2

        vmovdqu XMMWORD[(16-128)+rax],xmm4
        vpaddd  xmm10,xmm10,xmm4
        vpor    xmm8,xmm8,xmm9
        vpsrld  xmm5,xmm0,31
        vpand   xmm6,xmm6,xmm12
        vpaddd  xmm0,xmm0,xmm0

        vpslld  xmm7,xmm12,30
        vpaddd  xmm10,xmm10,xmm6

        vpsrld  xmm12,xmm12,2
        vpaddd  xmm10,xmm10,xmm8
        vpor    xmm0,xmm0,xmm5
        vpor    xmm12,xmm12,xmm7
        vpxor   xmm1,xmm1,xmm3
        vmovdqa xmm3,XMMWORD[((80-128))+rax]

        vpaddd  xmm14,xmm14,xmm15
        vpslld  xmm8,xmm10,5
        vpand   xmm7,xmm13,xmm12
        vpxor   xmm1,xmm1,XMMWORD[((176-128))+rax]

        vpaddd  xmm14,xmm14,xmm7
        vpsrld  xmm9,xmm10,27
        vpxor   xmm6,xmm13,xmm12
        vpxor   xmm1,xmm1,xmm3

        vmovdqu XMMWORD[(32-128)+rax],xmm0
        vpaddd  xmm14,xmm14,xmm0
        vpor    xmm8,xmm8,xmm9
        vpsrld  xmm5,xmm1,31
        vpand   xmm6,xmm6,xmm11
        vpaddd  xmm1,xmm1,xmm1

        vpslld  xmm7,xmm11,30
        vpaddd  xmm14,xmm14,xmm6

        vpsrld  xmm11,xmm11,2
        vpaddd  xmm14,xmm14,xmm8
        vpor    xmm1,xmm1,xmm5
        vpor    xmm11,xmm11,xmm7
        vpxor   xmm2,xmm2,xmm4
        vmovdqa xmm4,XMMWORD[((96-128))+rax]

        vpaddd  xmm13,xmm13,xmm15
        vpslld  xmm8,xmm14,5
        vpand   xmm7,xmm12,xmm11
        vpxor   xmm2,xmm2,XMMWORD[((192-128))+rax]

        vpaddd  xmm13,xmm13,xmm7
        vpsrld  xmm9,xmm14,27
        vpxor   xmm6,xmm12,xmm11
        vpxor   xmm2,xmm2,xmm4

        vmovdqu XMMWORD[(48-128)+rax],xmm1
        vpaddd  xmm13,xmm13,xmm1
        vpor    xmm8,xmm8,xmm9
        vpsrld  xmm5,xmm2,31
        vpand   xmm6,xmm6,xmm10
        vpaddd  xmm2,xmm2,xmm2

        vpslld  xmm7,xmm10,30
        vpaddd  xmm13,xmm13,xmm6

        vpsrld  xmm10,xmm10,2
        vpaddd  xmm13,xmm13,xmm8
        vpor    xmm2,xmm2,xmm5
        vpor    xmm10,xmm10,xmm7
        vpxor   xmm3,xmm3,xmm0
        vmovdqa xmm0,XMMWORD[((112-128))+rax]

        vpaddd  xmm12,xmm12,xmm15
        vpslld  xmm8,xmm13,5
        vpand   xmm7,xmm11,xmm10
        vpxor   xmm3,xmm3,XMMWORD[((208-128))+rax]

        vpaddd  xmm12,xmm12,xmm7
        vpsrld  xmm9,xmm13,27
        vpxor   xmm6,xmm11,xmm10
        vpxor   xmm3,xmm3,xmm0

        vmovdqu XMMWORD[(64-128)+rax],xmm2
        vpaddd  xmm12,xmm12,xmm2
        vpor    xmm8,xmm8,xmm9
        vpsrld  xmm5,xmm3,31
        vpand   xmm6,xmm6,xmm14
        vpaddd  xmm3,xmm3,xmm3

        vpslld  xmm7,xmm14,30
        vpaddd  xmm12,xmm12,xmm6

        vpsrld  xmm14,xmm14,2
        vpaddd  xmm12,xmm12,xmm8
        vpor    xmm3,xmm3,xmm5
        vpor    xmm14,xmm14,xmm7
        vpxor   xmm4,xmm4,xmm1
        vmovdqa xmm1,XMMWORD[((128-128))+rax]

        vpaddd  xmm11,xmm11,xmm15
        vpslld  xmm8,xmm12,5
        vpand   xmm7,xmm10,xmm14
        vpxor   xmm4,xmm4,XMMWORD[((224-128))+rax]

        vpaddd  xmm11,xmm11,xmm7
        vpsrld  xmm9,xmm12,27
        vpxor   xmm6,xmm10,xmm14
        vpxor   xmm4,xmm4,xmm1

        vmovdqu XMMWORD[(80-128)+rax],xmm3
        vpaddd  xmm11,xmm11,xmm3
        vpor    xmm8,xmm8,xmm9
        vpsrld  xmm5,xmm4,31
        vpand   xmm6,xmm6,xmm13
        vpaddd  xmm4,xmm4,xmm4

        vpslld  xmm7,xmm13,30
        vpaddd  xmm11,xmm11,xmm6

        vpsrld  xmm13,xmm13,2
        vpaddd  xmm11,xmm11,xmm8
        vpor    xmm4,xmm4,xmm5
        vpor    xmm13,xmm13,xmm7
        vpxor   xmm0,xmm0,xmm2
        vmovdqa xmm2,XMMWORD[((144-128))+rax]

        vpaddd  xmm10,xmm10,xmm15
        vpslld  xmm8,xmm11,5
        vpand   xmm7,xmm14,xmm13
        vpxor   xmm0,xmm0,XMMWORD[((240-128))+rax]

        vpaddd  xmm10,xmm10,xmm7
        vpsrld  xmm9,xmm11,27
        vpxor   xmm6,xmm14,xmm13
        vpxor   xmm0,xmm0,xmm2

        vmovdqu XMMWORD[(96-128)+rax],xmm4
        vpaddd  xmm10,xmm10,xmm4
        vpor    xmm8,xmm8,xmm9
        vpsrld  xmm5,xmm0,31
        vpand   xmm6,xmm6,xmm12
        vpaddd  xmm0,xmm0,xmm0

        vpslld  xmm7,xmm12,30
        vpaddd  xmm10,xmm10,xmm6

        vpsrld  xmm12,xmm12,2
        vpaddd  xmm10,xmm10,xmm8
        vpor    xmm0,xmm0,xmm5
        vpor    xmm12,xmm12,xmm7
        vpxor   xmm1,xmm1,xmm3
        vmovdqa xmm3,XMMWORD[((160-128))+rax]

        vpaddd  xmm14,xmm14,xmm15
        vpslld  xmm8,xmm10,5
        vpand   xmm7,xmm13,xmm12
        vpxor   xmm1,xmm1,XMMWORD[((0-128))+rax]

        vpaddd  xmm14,xmm14,xmm7
        vpsrld  xmm9,xmm10,27
        vpxor   xmm6,xmm13,xmm12
        vpxor   xmm1,xmm1,xmm3

        vmovdqu XMMWORD[(112-128)+rax],xmm0
        vpaddd  xmm14,xmm14,xmm0
        vpor    xmm8,xmm8,xmm9
        vpsrld  xmm5,xmm1,31
        vpand   xmm6,xmm6,xmm11
        vpaddd  xmm1,xmm1,xmm1

        vpslld  xmm7,xmm11,30
        vpaddd  xmm14,xmm14,xmm6

        vpsrld  xmm11,xmm11,2
        vpaddd  xmm14,xmm14,xmm8
        vpor    xmm1,xmm1,xmm5
        vpor    xmm11,xmm11,xmm7
        vpxor   xmm2,xmm2,xmm4
        vmovdqa xmm4,XMMWORD[((176-128))+rax]

        vpaddd  xmm13,xmm13,xmm15
        vpslld  xmm8,xmm14,5
        vpand   xmm7,xmm12,xmm11
        vpxor   xmm2,xmm2,XMMWORD[((16-128))+rax]

        vpaddd  xmm13,xmm13,xmm7
        vpsrld  xmm9,xmm14,27
        vpxor   xmm6,xmm12,xmm11
        vpxor   xmm2,xmm2,xmm4

        vmovdqu XMMWORD[(128-128)+rax],xmm1
        vpaddd  xmm13,xmm13,xmm1
        vpor    xmm8,xmm8,xmm9
        vpsrld  xmm5,xmm2,31
        vpand   xmm6,xmm6,xmm10
        vpaddd  xmm2,xmm2,xmm2

        vpslld  xmm7,xmm10,30
        vpaddd  xmm13,xmm13,xmm6

        vpsrld  xmm10,xmm10,2
        vpaddd  xmm13,xmm13,xmm8
        vpor    xmm2,xmm2,xmm5
        vpor    xmm10,xmm10,xmm7
        vpxor   xmm3,xmm3,xmm0
        vmovdqa xmm0,XMMWORD[((192-128))+rax]

        vpaddd  xmm12,xmm12,xmm15
        vpslld  xmm8,xmm13,5
        vpand   xmm7,xmm11,xmm10
        vpxor   xmm3,xmm3,XMMWORD[((32-128))+rax]

        vpaddd  xmm12,xmm12,xmm7
        vpsrld  xmm9,xmm13,27
        vpxor   xmm6,xmm11,xmm10
        vpxor   xmm3,xmm3,xmm0

        vmovdqu XMMWORD[(144-128)+rax],xmm2
        vpaddd  xmm12,xmm12,xmm2
        vpor    xmm8,xmm8,xmm9
        vpsrld  xmm5,xmm3,31
        vpand   xmm6,xmm6,xmm14
        vpaddd  xmm3,xmm3,xmm3

        vpslld  xmm7,xmm14,30
        vpaddd  xmm12,xmm12,xmm6

        vpsrld  xmm14,xmm14,2
        vpaddd  xmm12,xmm12,xmm8
        vpor    xmm3,xmm3,xmm5
        vpor    xmm14,xmm14,xmm7
        vpxor   xmm4,xmm4,xmm1
        vmovdqa xmm1,XMMWORD[((208-128))+rax]

        vpaddd  xmm11,xmm11,xmm15
        vpslld  xmm8,xmm12,5
        vpand   xmm7,xmm10,xmm14
        vpxor   xmm4,xmm4,XMMWORD[((48-128))+rax]

        vpaddd  xmm11,xmm11,xmm7
        vpsrld  xmm9,xmm12,27
        vpxor   xmm6,xmm10,xmm14
        vpxor   xmm4,xmm4,xmm1

        vmovdqu XMMWORD[(160-128)+rax],xmm3
        vpaddd  xmm11,xmm11,xmm3
        vpor    xmm8,xmm8,xmm9
        vpsrld  xmm5,xmm4,31
        vpand   xmm6,xmm6,xmm13
        vpaddd  xmm4,xmm4,xmm4

        vpslld  xmm7,xmm13,30
        vpaddd  xmm11,xmm11,xmm6

        vpsrld  xmm13,xmm13,2
        vpaddd  xmm11,xmm11,xmm8
        vpor    xmm4,xmm4,xmm5
        vpor    xmm13,xmm13,xmm7
        vpxor   xmm0,xmm0,xmm2
        vmovdqa xmm2,XMMWORD[((224-128))+rax]

        vpaddd  xmm10,xmm10,xmm15
        vpslld  xmm8,xmm11,5
        vpand   xmm7,xmm14,xmm13
        vpxor   xmm0,xmm0,XMMWORD[((64-128))+rax]

        vpaddd  xmm10,xmm10,xmm7
        vpsrld  xmm9,xmm11,27
        vpxor   xmm6,xmm14,xmm13
        vpxor   xmm0,xmm0,xmm2

        vmovdqu XMMWORD[(176-128)+rax],xmm4
        vpaddd  xmm10,xmm10,xmm4
        vpor    xmm8,xmm8,xmm9
        vpsrld  xmm5,xmm0,31
        vpand   xmm6,xmm6,xmm12
        vpaddd  xmm0,xmm0,xmm0

        vpslld  xmm7,xmm12,30
        vpaddd  xmm10,xmm10,xmm6

        vpsrld  xmm12,xmm12,2
        vpaddd  xmm10,xmm10,xmm8
        vpor    xmm0,xmm0,xmm5
        vpor    xmm12,xmm12,xmm7
        ; ---- One unrolled round, three-way XOR form again; reloads xmm15 -----
        ; The mixing function returns to xmm13 ^ xmm11 ^ xmm12 (the preceding
        ; rounds used the and/xor majority form), and xmm15 is reloaded from
        ; 64 bytes past rbp.
        ; Roles in this round: xmm14 accumulator, xmm10 rotated left by 5,
        ; xmm11 rotated left by 30 in place, xmm0 word stored and added,
        ; xmm1 next word being built, xmm5..xmm9 scratch.
        vmovdqa xmm15,XMMWORD[64+rbp]                   ; next constant vector
        vpxor   xmm1,xmm1,xmm3                          ; next word: fold in old xmm3
        vmovdqa xmm3,XMMWORD[((240-128))+rax]           ; reload xmm3 from slot 240

        vpslld  xmm8,xmm10,5                            ; high part of rotl(xmm10,5)
        vpaddd  xmm14,xmm14,xmm15                       ; acc += constant
        vpxor   xmm6,xmm13,xmm11                        ; xmm13 ^ xmm11
        vmovdqa XMMWORD[(192-128)+rax],xmm0             ; save this round's word to slot 192
        vpaddd  xmm14,xmm14,xmm0                        ; acc += word
        vpxor   xmm1,xmm1,XMMWORD[((80-128))+rax]       ; next word ^= slot 80
        vpsrld  xmm9,xmm10,27                           ; low part of rotl(xmm10,5)
        vpxor   xmm6,xmm6,xmm12                         ; xmm13 ^ xmm11 ^ xmm12
        vpxor   xmm1,xmm1,xmm3                          ; next word ^= freshly loaded slot 240

        vpslld  xmm7,xmm11,30                           ; high part of rotl(xmm11,30)
        vpor    xmm8,xmm8,xmm9                          ; xmm8 = rotl(xmm10,5)
        vpaddd  xmm14,xmm14,xmm6                        ; acc += three-way XOR
        vpsrld  xmm5,xmm1,31                            ; bit shifted out by the doubling below
        vpaddd  xmm1,xmm1,xmm1                          ; x+x == x<<1

        vpsrld  xmm11,xmm11,2                           ; low part of rotl(xmm11,30)
        vpaddd  xmm14,xmm14,xmm8                        ; acc += rotl(xmm10,5)
        vpor    xmm1,xmm1,xmm5                          ; xmm1 = rotl(next word,1)
        vpor    xmm11,xmm11,xmm7                        ; xmm11 = rotl(xmm11,30)
        vpxor   xmm2,xmm2,xmm4
        vmovdqa xmm4,XMMWORD[((0-128))+rax]

        vpslld  xmm8,xmm14,5
        vpaddd  xmm13,xmm13,xmm15
        vpxor   xmm6,xmm12,xmm10
        vmovdqa XMMWORD[(208-128)+rax],xmm1
        vpaddd  xmm13,xmm13,xmm1
        vpxor   xmm2,xmm2,XMMWORD[((96-128))+rax]
        vpsrld  xmm9,xmm14,27
        vpxor   xmm6,xmm6,xmm11
        vpxor   xmm2,xmm2,xmm4

        vpslld  xmm7,xmm10,30
        vpor    xmm8,xmm8,xmm9
        vpaddd  xmm13,xmm13,xmm6
        vpsrld  xmm5,xmm2,31
        vpaddd  xmm2,xmm2,xmm2

        vpsrld  xmm10,xmm10,2
        vpaddd  xmm13,xmm13,xmm8
        vpor    xmm2,xmm2,xmm5
        vpor    xmm10,xmm10,xmm7
        vpxor   xmm3,xmm3,xmm0
        vmovdqa xmm0,XMMWORD[((16-128))+rax]

        vpslld  xmm8,xmm13,5
        vpaddd  xmm12,xmm12,xmm15
        vpxor   xmm6,xmm11,xmm14
        vmovdqa XMMWORD[(224-128)+rax],xmm2
        vpaddd  xmm12,xmm12,xmm2
        vpxor   xmm3,xmm3,XMMWORD[((112-128))+rax]
        vpsrld  xmm9,xmm13,27
        vpxor   xmm6,xmm6,xmm10
        vpxor   xmm3,xmm3,xmm0

        vpslld  xmm7,xmm14,30
        vpor    xmm8,xmm8,xmm9
        vpaddd  xmm12,xmm12,xmm6
        vpsrld  xmm5,xmm3,31
        vpaddd  xmm3,xmm3,xmm3

        vpsrld  xmm14,xmm14,2
        vpaddd  xmm12,xmm12,xmm8
        vpor    xmm3,xmm3,xmm5
        vpor    xmm14,xmm14,xmm7
        vpxor   xmm4,xmm4,xmm1
        vmovdqa xmm1,XMMWORD[((32-128))+rax]

        vpslld  xmm8,xmm12,5
        vpaddd  xmm11,xmm11,xmm15
        vpxor   xmm6,xmm10,xmm13
        vmovdqa XMMWORD[(240-128)+rax],xmm3
        vpaddd  xmm11,xmm11,xmm3
        vpxor   xmm4,xmm4,XMMWORD[((128-128))+rax]
        vpsrld  xmm9,xmm12,27
        vpxor   xmm6,xmm6,xmm14
        vpxor   xmm4,xmm4,xmm1

        vpslld  xmm7,xmm13,30
        vpor    xmm8,xmm8,xmm9
        vpaddd  xmm11,xmm11,xmm6
        vpsrld  xmm5,xmm4,31
        vpaddd  xmm4,xmm4,xmm4

        vpsrld  xmm13,xmm13,2
        vpaddd  xmm11,xmm11,xmm8
        vpor    xmm4,xmm4,xmm5
        vpor    xmm13,xmm13,xmm7
        vpxor   xmm0,xmm0,xmm2
        vmovdqa xmm2,XMMWORD[((48-128))+rax]

        vpslld  xmm8,xmm11,5
        vpaddd  xmm10,xmm10,xmm15
        vpxor   xmm6,xmm14,xmm12
        vmovdqa XMMWORD[(0-128)+rax],xmm4
        vpaddd  xmm10,xmm10,xmm4
        vpxor   xmm0,xmm0,XMMWORD[((144-128))+rax]
        vpsrld  xmm9,xmm11,27
        vpxor   xmm6,xmm6,xmm13
        vpxor   xmm0,xmm0,xmm2

        vpslld  xmm7,xmm12,30
        vpor    xmm8,xmm8,xmm9
        vpaddd  xmm10,xmm10,xmm6
        vpsrld  xmm5,xmm0,31
        vpaddd  xmm0,xmm0,xmm0

        vpsrld  xmm12,xmm12,2
        vpaddd  xmm10,xmm10,xmm8
        vpor    xmm0,xmm0,xmm5
        vpor    xmm12,xmm12,xmm7
        vpxor   xmm1,xmm1,xmm3
        vmovdqa xmm3,XMMWORD[((64-128))+rax]

        vpslld  xmm8,xmm10,5
        vpaddd  xmm14,xmm14,xmm15
        vpxor   xmm6,xmm13,xmm11
        vmovdqa XMMWORD[(16-128)+rax],xmm0
        vpaddd  xmm14,xmm14,xmm0
        vpxor   xmm1,xmm1,XMMWORD[((160-128))+rax]
        vpsrld  xmm9,xmm10,27
        vpxor   xmm6,xmm6,xmm12
        vpxor   xmm1,xmm1,xmm3

        vpslld  xmm7,xmm11,30
        vpor    xmm8,xmm8,xmm9
        vpaddd  xmm14,xmm14,xmm6
        vpsrld  xmm5,xmm1,31
        vpaddd  xmm1,xmm1,xmm1

        vpsrld  xmm11,xmm11,2
        vpaddd  xmm14,xmm14,xmm8
        vpor    xmm1,xmm1,xmm5
        vpor    xmm11,xmm11,xmm7
        vpxor   xmm2,xmm2,xmm4
        vmovdqa xmm4,XMMWORD[((80-128))+rax]

        vpslld  xmm8,xmm14,5
        vpaddd  xmm13,xmm13,xmm15
        vpxor   xmm6,xmm12,xmm10
        vmovdqa XMMWORD[(32-128)+rax],xmm1
        vpaddd  xmm13,xmm13,xmm1
        vpxor   xmm2,xmm2,XMMWORD[((176-128))+rax]
        vpsrld  xmm9,xmm14,27
        vpxor   xmm6,xmm6,xmm11
        vpxor   xmm2,xmm2,xmm4

        vpslld  xmm7,xmm10,30
        vpor    xmm8,xmm8,xmm9
        vpaddd  xmm13,xmm13,xmm6
        vpsrld  xmm5,xmm2,31
        vpaddd  xmm2,xmm2,xmm2

        vpsrld  xmm10,xmm10,2
        vpaddd  xmm13,xmm13,xmm8
        vpor    xmm2,xmm2,xmm5
        vpor    xmm10,xmm10,xmm7
        vpxor   xmm3,xmm3,xmm0
        vmovdqa xmm0,XMMWORD[((96-128))+rax]

        vpslld  xmm8,xmm13,5
        vpaddd  xmm12,xmm12,xmm15
        vpxor   xmm6,xmm11,xmm14
        vmovdqa XMMWORD[(48-128)+rax],xmm2
        vpaddd  xmm12,xmm12,xmm2
        vpxor   xmm3,xmm3,XMMWORD[((192-128))+rax]
        vpsrld  xmm9,xmm13,27
        vpxor   xmm6,xmm6,xmm10
        vpxor   xmm3,xmm3,xmm0

        vpslld  xmm7,xmm14,30
        vpor    xmm8,xmm8,xmm9
        vpaddd  xmm12,xmm12,xmm6
        vpsrld  xmm5,xmm3,31
        vpaddd  xmm3,xmm3,xmm3

        vpsrld  xmm14,xmm14,2
        vpaddd  xmm12,xmm12,xmm8
        vpor    xmm3,xmm3,xmm5
        vpor    xmm14,xmm14,xmm7
        vpxor   xmm4,xmm4,xmm1
        vmovdqa xmm1,XMMWORD[((112-128))+rax]

        vpslld  xmm8,xmm12,5
        vpaddd  xmm11,xmm11,xmm15
        vpxor   xmm6,xmm10,xmm13
        vmovdqa XMMWORD[(64-128)+rax],xmm3
        vpaddd  xmm11,xmm11,xmm3
        vpxor   xmm4,xmm4,XMMWORD[((208-128))+rax]
        vpsrld  xmm9,xmm12,27
        vpxor   xmm6,xmm6,xmm14
        vpxor   xmm4,xmm4,xmm1

        vpslld  xmm7,xmm13,30
        vpor    xmm8,xmm8,xmm9
        vpaddd  xmm11,xmm11,xmm6
        vpsrld  xmm5,xmm4,31
        vpaddd  xmm4,xmm4,xmm4

        vpsrld  xmm13,xmm13,2
        vpaddd  xmm11,xmm11,xmm8
        vpor    xmm4,xmm4,xmm5
        vpor    xmm13,xmm13,xmm7
        vpxor   xmm0,xmm0,xmm2
        vmovdqa xmm2,XMMWORD[((128-128))+rax]

        vpslld  xmm8,xmm11,5
        vpaddd  xmm10,xmm10,xmm15
        vpxor   xmm6,xmm14,xmm12
        vmovdqa XMMWORD[(80-128)+rax],xmm4
        vpaddd  xmm10,xmm10,xmm4
        vpxor   xmm0,xmm0,XMMWORD[((224-128))+rax]
        vpsrld  xmm9,xmm11,27
        vpxor   xmm6,xmm6,xmm13
        vpxor   xmm0,xmm0,xmm2

        vpslld  xmm7,xmm12,30
        vpor    xmm8,xmm8,xmm9
        vpaddd  xmm10,xmm10,xmm6
        vpsrld  xmm5,xmm0,31
        vpaddd  xmm0,xmm0,xmm0

        vpsrld  xmm12,xmm12,2
        vpaddd  xmm10,xmm10,xmm8
        vpor    xmm0,xmm0,xmm5
        vpor    xmm12,xmm12,xmm7
        vpxor   xmm1,xmm1,xmm3
        vmovdqa xmm3,XMMWORD[((144-128))+rax]

        vpslld  xmm8,xmm10,5
        vpaddd  xmm14,xmm14,xmm15
        vpxor   xmm6,xmm13,xmm11
        vmovdqa XMMWORD[(96-128)+rax],xmm0
        vpaddd  xmm14,xmm14,xmm0
        vpxor   xmm1,xmm1,XMMWORD[((240-128))+rax]
        vpsrld  xmm9,xmm10,27
        vpxor   xmm6,xmm6,xmm12
        vpxor   xmm1,xmm1,xmm3

        vpslld  xmm7,xmm11,30
        vpor    xmm8,xmm8,xmm9
        vpaddd  xmm14,xmm14,xmm6
        vpsrld  xmm5,xmm1,31
        vpaddd  xmm1,xmm1,xmm1

        vpsrld  xmm11,xmm11,2
        vpaddd  xmm14,xmm14,xmm8
        vpor    xmm1,xmm1,xmm5
        vpor    xmm11,xmm11,xmm7
        vpxor   xmm2,xmm2,xmm4
        vmovdqa xmm4,XMMWORD[((160-128))+rax]

        vpslld  xmm8,xmm14,5
        vpaddd  xmm13,xmm13,xmm15
        vpxor   xmm6,xmm12,xmm10
        vmovdqa XMMWORD[(112-128)+rax],xmm1
        vpaddd  xmm13,xmm13,xmm1
        vpxor   xmm2,xmm2,XMMWORD[((0-128))+rax]
        vpsrld  xmm9,xmm14,27
        vpxor   xmm6,xmm6,xmm11
        vpxor   xmm2,xmm2,xmm4

        vpslld  xmm7,xmm10,30
        vpor    xmm8,xmm8,xmm9
        vpaddd  xmm13,xmm13,xmm6
        vpsrld  xmm5,xmm2,31
        vpaddd  xmm2,xmm2,xmm2

        vpsrld  xmm10,xmm10,2
        vpaddd  xmm13,xmm13,xmm8
        vpor    xmm2,xmm2,xmm5
        vpor    xmm10,xmm10,xmm7
        vpxor   xmm3,xmm3,xmm0
        vmovdqa xmm0,XMMWORD[((176-128))+rax]

        vpslld  xmm8,xmm13,5
        vpaddd  xmm12,xmm12,xmm15
        vpxor   xmm6,xmm11,xmm14
        vpaddd  xmm12,xmm12,xmm2
        vpxor   xmm3,xmm3,XMMWORD[((16-128))+rax]
        vpsrld  xmm9,xmm13,27
        vpxor   xmm6,xmm6,xmm10
        vpxor   xmm3,xmm3,xmm0

        vpslld  xmm7,xmm14,30
        vpor    xmm8,xmm8,xmm9
        vpaddd  xmm12,xmm12,xmm6
        vpsrld  xmm5,xmm3,31
        vpaddd  xmm3,xmm3,xmm3

        vpsrld  xmm14,xmm14,2
        vpaddd  xmm12,xmm12,xmm8
        vpor    xmm3,xmm3,xmm5
        vpor    xmm14,xmm14,xmm7
        vpxor   xmm4,xmm4,xmm1
        vmovdqa xmm1,XMMWORD[((192-128))+rax]

        vpslld  xmm8,xmm12,5
        vpaddd  xmm11,xmm11,xmm15
        vpxor   xmm6,xmm10,xmm13
        vpaddd  xmm11,xmm11,xmm3
        vpxor   xmm4,xmm4,XMMWORD[((32-128))+rax]
        vpsrld  xmm9,xmm12,27
        vpxor   xmm6,xmm6,xmm14
        vpxor   xmm4,xmm4,xmm1

        vpslld  xmm7,xmm13,30
        vpor    xmm8,xmm8,xmm9
        vpaddd  xmm11,xmm11,xmm6
        vpsrld  xmm5,xmm4,31
        vpaddd  xmm4,xmm4,xmm4

        vpsrld  xmm13,xmm13,2
        vpaddd  xmm11,xmm11,xmm8
        vpor    xmm4,xmm4,xmm5
        vpor    xmm13,xmm13,xmm7
        vpxor   xmm0,xmm0,xmm2
        vmovdqa xmm2,XMMWORD[((208-128))+rax]

        vpslld  xmm8,xmm11,5
        vpaddd  xmm10,xmm10,xmm15
        vpxor   xmm6,xmm14,xmm12
        vpaddd  xmm10,xmm10,xmm4
        vpxor   xmm0,xmm0,XMMWORD[((48-128))+rax]
        vpsrld  xmm9,xmm11,27
        vpxor   xmm6,xmm6,xmm13
        vpxor   xmm0,xmm0,xmm2

        vpslld  xmm7,xmm12,30
        vpor    xmm8,xmm8,xmm9
        vpaddd  xmm10,xmm10,xmm6
        vpsrld  xmm5,xmm0,31
        vpaddd  xmm0,xmm0,xmm0

        vpsrld  xmm12,xmm12,2
        vpaddd  xmm10,xmm10,xmm8
        vpor    xmm0,xmm0,xmm5
        vpor    xmm12,xmm12,xmm7
        vpxor   xmm1,xmm1,xmm3
        vmovdqa xmm3,XMMWORD[((224-128))+rax]

        vpslld  xmm8,xmm10,5
        vpaddd  xmm14,xmm14,xmm15
        vpxor   xmm6,xmm13,xmm11
        vpaddd  xmm14,xmm14,xmm0
        vpxor   xmm1,xmm1,XMMWORD[((64-128))+rax]
        vpsrld  xmm9,xmm10,27
        vpxor   xmm6,xmm6,xmm12
        vpxor   xmm1,xmm1,xmm3

        vpslld  xmm7,xmm11,30
        vpor    xmm8,xmm8,xmm9
        vpaddd  xmm14,xmm14,xmm6
        vpsrld  xmm5,xmm1,31
        vpaddd  xmm1,xmm1,xmm1

        vpsrld  xmm11,xmm11,2
        vpaddd  xmm14,xmm14,xmm8
        vpor    xmm1,xmm1,xmm5
        vpor    xmm11,xmm11,xmm7
        vpxor   xmm2,xmm2,xmm4
        vmovdqa xmm4,XMMWORD[((240-128))+rax]

        vpslld  xmm8,xmm14,5
        vpaddd  xmm13,xmm13,xmm15
        vpxor   xmm6,xmm12,xmm10
        vpaddd  xmm13,xmm13,xmm1
        vpxor   xmm2,xmm2,XMMWORD[((80-128))+rax]
        vpsrld  xmm9,xmm14,27
        vpxor   xmm6,xmm6,xmm11
        vpxor   xmm2,xmm2,xmm4

        vpslld  xmm7,xmm10,30
        vpor    xmm8,xmm8,xmm9
        vpaddd  xmm13,xmm13,xmm6
        vpsrld  xmm5,xmm2,31
        vpaddd  xmm2,xmm2,xmm2

        vpsrld  xmm10,xmm10,2
        vpaddd  xmm13,xmm13,xmm8
        vpor    xmm2,xmm2,xmm5
        vpor    xmm10,xmm10,xmm7
        vpxor   xmm3,xmm3,xmm0
        vmovdqa xmm0,XMMWORD[((0-128))+rax]

        vpslld  xmm8,xmm13,5
        vpaddd  xmm12,xmm12,xmm15
        vpxor   xmm6,xmm11,xmm14
        vpaddd  xmm12,xmm12,xmm2
        vpxor   xmm3,xmm3,XMMWORD[((96-128))+rax]
        vpsrld  xmm9,xmm13,27
        vpxor   xmm6,xmm6,xmm10
        vpxor   xmm3,xmm3,xmm0

        vpslld  xmm7,xmm14,30
        vpor    xmm8,xmm8,xmm9
        vpaddd  xmm12,xmm12,xmm6
        vpsrld  xmm5,xmm3,31
        vpaddd  xmm3,xmm3,xmm3

        vpsrld  xmm14,xmm14,2
        vpaddd  xmm12,xmm12,xmm8
        vpor    xmm3,xmm3,xmm5
        vpor    xmm14,xmm14,xmm7
        vpxor   xmm4,xmm4,xmm1
        vmovdqa xmm1,XMMWORD[((16-128))+rax]

        vpslld  xmm8,xmm12,5
        vpaddd  xmm11,xmm11,xmm15
        vpxor   xmm6,xmm10,xmm13
        vpaddd  xmm11,xmm11,xmm3
        vpxor   xmm4,xmm4,XMMWORD[((112-128))+rax]
        vpsrld  xmm9,xmm12,27
        vpxor   xmm6,xmm6,xmm14
        vpxor   xmm4,xmm4,xmm1

        vpslld  xmm7,xmm13,30
        vpor    xmm8,xmm8,xmm9
        vpaddd  xmm11,xmm11,xmm6
        vpsrld  xmm5,xmm4,31
        vpaddd  xmm4,xmm4,xmm4

        vpsrld  xmm13,xmm13,2
        vpaddd  xmm11,xmm11,xmm8
        vpor    xmm4,xmm4,xmm5
        vpor    xmm13,xmm13,xmm7
        vpslld  xmm8,xmm11,5
        vpaddd  xmm10,xmm10,xmm15
        vpxor   xmm6,xmm14,xmm12

        vpsrld  xmm9,xmm11,27
        vpaddd  xmm10,xmm10,xmm4
        vpxor   xmm6,xmm6,xmm13

        vpslld  xmm7,xmm12,30
        vpor    xmm8,xmm8,xmm9
        vpaddd  xmm10,xmm10,xmm6

        vpsrld  xmm12,xmm12,2
        vpaddd  xmm10,xmm10,xmm8
        vpor    xmm12,xmm12,xmm7
        ; ---- end of one inner-loop pass (4 lanes, 128-bit AVX) ----
        ; Lane i keeps a dword counter at [rbx+4*i] and an input pointer in
        ; r8/r9/r10/r11.  A lane whose counter is <= 1 (signed compare of 1
        ; against the counter) has its pointer replaced by rbp.
        ; NOTE(review): rbx/rbp are set up outside this excerpt; the other
        ; code paths in this file point rbp at K_XX_XX and rbx at the counter
        ; array on the stack -- presumably the same here, confirm.
        mov     ecx,1
        cmp     ecx,DWORD[rbx]
        cmovge  r8,rbp
        cmp     ecx,DWORD[4+rbx]
        cmovge  r9,rbp
        cmp     ecx,DWORD[8+rbx]
        cmovge  r10,rbp
        cmp     ecx,DWORD[12+rbx]
        cmovge  r11,rbp
        ; xmm6 = the four counters, xmm7 = per-lane mask (all ones where the
        ; counter is > 0, else zero).  Adding the mask (-1 or 0) decrements
        ; only the counters that are still positive.
        vmovdqu xmm6,XMMWORD[rbx]
        vpxor   xmm8,xmm8,xmm8
        vmovdqa xmm7,xmm6
        vpcmpgtd        xmm7,xmm7,xmm8
        vpaddd  xmm6,xmm6,xmm7

        ; Zero the five working vectors xmm10..xmm14 in lanes whose counter
        ; was not positive, then add the stored context rows.  Masked lanes
        ; therefore write back the stored value unchanged.  The rows sit 32
        ; bytes apart at rdi; only the low 16 bytes of each row are touched.
        vpand   xmm10,xmm10,xmm7
        vpand   xmm11,xmm11,xmm7
        vpaddd  xmm10,xmm10,XMMWORD[rdi]
        vpand   xmm12,xmm12,xmm7
        vpaddd  xmm11,xmm11,XMMWORD[32+rdi]
        vpand   xmm13,xmm13,xmm7
        vpaddd  xmm12,xmm12,XMMWORD[64+rdi]
        vpand   xmm14,xmm14,xmm7
        vpaddd  xmm13,xmm13,XMMWORD[96+rdi]
        vpaddd  xmm14,xmm14,XMMWORD[128+rdi]
        vmovdqu XMMWORD[rdi],xmm10
        vmovdqu XMMWORD[32+rdi],xmm11
        vmovdqu XMMWORD[64+rdi],xmm12
        vmovdqu XMMWORD[96+rdi],xmm13
        vmovdqu XMMWORD[128+rdi],xmm14

        ; Write the updated counters back.  xmm5 was used as scratch by the
        ; rounds above, so the 16-byte constant at rbp+96 is reloaded into it
        ; before the next pass (presumably the byte-swap shuffle mask, as in
        ; the AVX2 path where the same offset feeds vpshufb -- confirm).
        vmovdqu XMMWORD[rbx],xmm6
        vmovdqu xmm5,XMMWORD[96+rbp]
        ; edx counts the remaining inner passes (initialised outside this
        ; excerpt).
        dec     edx
        jnz     NEAR $L$oop_avx

        ; Outer loop: reload the outer counter kept at [280+rsp], advance the
        ; context pointer by 16 bytes and the descriptor pointer by 64 bytes,
        ; then repeat while the counter is non-zero.
        ; NOTE(review): 64 bytes matches four 16-byte descriptors as read at
        ; the top of the file (pointer at +0, dword count at +8) -- confirm.
        mov     edx,DWORD[280+rsp]
        lea     rdi,[16+rdi]
        lea     rsi,[64+rsi]
        dec     edx
        jnz     NEAR $L$oop_grande_avx

; Common exit of the 128-bit AVX path: restore the Win64 callee-saved
; registers, unwind the stack frame and return.
$L$done_avx:
        ; rax = value saved at [272+rsp].  All restores below are relative
        ; to it.  NOTE(review): the matching prologue is outside this
        ; excerpt; the offsets fit the prologue shape shown at the top of the
        ; file (rax = rsp at entry, two pushes, then a 168-byte XMM save
        ; area) -- confirm.
        mov     rax,QWORD[272+rsp]

        ; Clear the upper ymm halves before the legacy-SSE movaps restores.
        vzeroupper
        ; xmm6..xmm15 are callee-saved under the Microsoft x64 convention.
        movaps  xmm6,XMMWORD[((-184))+rax]
        movaps  xmm7,XMMWORD[((-168))+rax]
        movaps  xmm8,XMMWORD[((-152))+rax]
        movaps  xmm9,XMMWORD[((-136))+rax]
        movaps  xmm10,XMMWORD[((-120))+rax]
        movaps  xmm11,XMMWORD[((-104))+rax]
        movaps  xmm12,XMMWORD[((-88))+rax]
        movaps  xmm13,XMMWORD[((-72))+rax]
        movaps  xmm14,XMMWORD[((-56))+rax]
        movaps  xmm15,XMMWORD[((-40))+rax]
        ; Reload rbp and rbx from the two slots just below rax.
        mov     rbp,QWORD[((-16))+rax]

        mov     rbx,QWORD[((-8))+rax]

        ; Drop the whole frame in one step: rsp = rax.
        lea     rsp,[rax]

$L$epilogue_avx:
        ; rdi/rsi are callee-saved on Win64; reload them from the two slots
        ; above the return address, where the WIN64 prologues in this file
        ; store them.
        mov     rdi,QWORD[8+rsp]        ;WIN64 epilogue
        mov     rsi,QWORD[16+rsp]
        ; Two-byte return encoded as raw bytes F3 C3 ("rep ret").
        DB      0F3h,0C3h               ;repret

$L$SEH_end_sha1_multi_block_avx:

ALIGN   32
;-----------------------------------------------------------------------
; sha1_multi_block_avx2 -- 8-lane (256-bit AVX2) code path.
; ABI:  Microsoft x64.  The three incoming arguments (rcx, rdx, r8) are
;       moved into rdi, rsi, rdx, which is how the rest of the body
;       addresses them:
;         rdi = context (five 32-byte rows are read at +0..+128)
;         rsi = array of 16-byte descriptors (pointer at +0, dword count
;               at +8)
;         edx = outer pass count (halved at $L$body_avx2)
; NOTE(review): the C-level prototype is not visible in this excerpt.
;-----------------------------------------------------------------------
sha1_multi_block_avx2:
        ; rdi/rsi are callee-saved on Win64: stash them in the two slots
        ; above the return address.
        mov     QWORD[8+rsp],rdi        ;WIN64 prologue
        mov     QWORD[16+rsp],rsi
        mov     rax,rsp
$L$SEH_begin_sha1_multi_block_avx2:
        mov     rdi,rcx
        mov     rsi,rdx
        mov     rdx,r8



; Second entry label placed after the argument shuffle.
; NOTE(review): the generic entry at the top of the file jumps to sibling
; *_shortcut labels after moving its own arguments; presumably this label
; is reached the same way -- confirm.
_avx2_shortcut:
        ; rax = stack pointer at entry; every save slot below is
        ; addressable relative to it.
        mov     rax,rsp

        ; Six callee-saved GPRs: rbx at rax-8 down to r15 at rax-48.
        push    rbx

        push    rbp

        push    r12

        push    r13

        push    r14

        push    r15

        ; 168-byte area for the callee-saved xmm6..xmm15.  After the pushes
        ; and this adjustment rsp = rax-216, so the rsp-relative and
        ; rax-relative stores below form one contiguous run:
        ; xmm6 at rax-216 through xmm15 at rax-72.
        lea     rsp,[((-168))+rsp]
        movaps  XMMWORD[rsp],xmm6
        movaps  XMMWORD[16+rsp],xmm7
        movaps  XMMWORD[32+rsp],xmm8
        movaps  XMMWORD[48+rsp],xmm9
        movaps  XMMWORD[64+rsp],xmm10
        movaps  XMMWORD[80+rsp],xmm11
        movaps  XMMWORD[(-120)+rax],xmm12
        movaps  XMMWORD[(-104)+rax],xmm13
        movaps  XMMWORD[(-88)+rax],xmm14
        movaps  XMMWORD[(-72)+rax],xmm15
        ; Reserve 576 bytes of scratch, round rsp down to a 256-byte
        ; boundary, and remember the entry rsp at [544+rsp].
        sub     rsp,576
        and     rsp,-256
        mov     QWORD[544+rsp],rax

; One-time setup for the AVX2 path, executed after the frame is built.
$L$body_avx2:
        ; rbp = address of the K_XX_XX constant table (defined elsewhere in
        ; this file); later code reads it at fixed offsets from rbp.
        lea     rbp,[K_XX_XX]
        ; Halve the outer pass count.
        ; NOTE(review): each outer pass of this path loads eight descriptors
        ; (see $L$oop_grande_avx2) versus four per pass in the 128-bit
        ; paths, which is presumably why the count is halved -- confirm.
        shr     edx,1

        ; Zero the upper halves of all ymm registers before the 256-bit
        ; code starts.
        vzeroupper
; Outer loop head of the AVX2 path: set up eight lanes for one pass.
; Stack frame as used by this path (rsp is 256-byte aligned):
;   rsp+0   .. rsp+511  sixteen 32-byte schedule slots (written by the
;                       inner loop through the biased bases rax and rbx)
;   rsp+512 .. rsp+543  eight dword per-lane block counters
;   rsp+544             entry rsp saved by the prologue
;   rsp+552             outer pass count
$L$oop_grande_avx2:
        ; Park the outer count; edx is reused below as a running signed
        ; maximum of the per-lane counts.
        mov     DWORD[552+rsp],edx
        xor     edx,edx
        lea     rbx,[512+rsp]

        ; Lane 0..7 setup, identical for each lane i:
        ;   pointer = [rsi+16*i], count = dword [rsi+16*i+8]
        ;   edx = max(edx, count)
        ;   counter[i] at [rbx+4*i] = count
        ;   if count <= 0 the pointer is replaced by rbp (the K_XX_XX
        ;   table), so loads for that lane do not use the caller's pointer.
        ; Lane pointers live in r12, r13, r14, r15, r8, r9, r10, r11.
        mov     r12,QWORD[rsi]

        mov     ecx,DWORD[8+rsi]
        cmp     ecx,edx
        cmovg   edx,ecx
        test    ecx,ecx
        mov     DWORD[rbx],ecx
        cmovle  r12,rbp

        mov     r13,QWORD[16+rsi]

        mov     ecx,DWORD[24+rsi]
        cmp     ecx,edx
        cmovg   edx,ecx
        test    ecx,ecx
        mov     DWORD[4+rbx],ecx
        cmovle  r13,rbp

        mov     r14,QWORD[32+rsi]

        mov     ecx,DWORD[40+rsi]
        cmp     ecx,edx
        cmovg   edx,ecx
        test    ecx,ecx
        mov     DWORD[8+rbx],ecx
        cmovle  r14,rbp

        mov     r15,QWORD[48+rsi]

        mov     ecx,DWORD[56+rsi]
        cmp     ecx,edx
        cmovg   edx,ecx
        test    ecx,ecx
        mov     DWORD[12+rbx],ecx
        cmovle  r15,rbp

        mov     r8,QWORD[64+rsi]

        mov     ecx,DWORD[72+rsi]
        cmp     ecx,edx
        cmovg   edx,ecx
        test    ecx,ecx
        mov     DWORD[16+rbx],ecx
        cmovle  r8,rbp

        mov     r9,QWORD[80+rsi]

        mov     ecx,DWORD[88+rsi]
        cmp     ecx,edx
        cmovg   edx,ecx
        test    ecx,ecx
        mov     DWORD[20+rbx],ecx
        cmovle  r9,rbp

        mov     r10,QWORD[96+rsi]

        mov     ecx,DWORD[104+rsi]
        cmp     ecx,edx
        cmovg   edx,ecx
        test    ecx,ecx
        mov     DWORD[24+rbx],ecx
        cmovle  r10,rbp

        mov     r11,QWORD[112+rsi]

        mov     ecx,DWORD[120+rsi]
        cmp     ecx,edx
        cmovg   edx,ecx
        test    ecx,ecx
        mov     DWORD[28+rbx],ecx
        cmovle  r11,rbp
        ; NOTE(review): no test of edx against zero is visible between here
        ; and the jump into the inner loop -- confirm that callers never
        ; pass a group in which every lane has a non-positive count.

        ; Load the five 32-byte context rows into ymm0..ymm4.  Interleaved
        ; with that, rax and rbx become biased bases for the schedule slots:
        ; the inner loop addresses them as (k-128)+rax and (k-256-128)+rbx,
        ; i.e. rsp+k (presumably so the displacements fit in a signed byte).
        vmovdqu ymm0,YMMWORD[rdi]
        lea     rax,[128+rsp]
        vmovdqu ymm1,YMMWORD[32+rdi]
        lea     rbx,[((256+128))+rsp]
        vmovdqu ymm2,YMMWORD[64+rdi]
        vmovdqu ymm3,YMMWORD[96+rdi]
        vmovdqu ymm4,YMMWORD[128+rdi]
        ; ymm9 = 32-byte constant at rbp+96, used by the inner loop as the
        ; vpshufb control vector applied to each freshly gathered word.
        vmovdqu ymm9,YMMWORD[96+rbp]
        jmp     NEAR $L$oop_avx2

ALIGN   32
$L$oop_avx2:
        vmovdqa ymm15,YMMWORD[((-32))+rbp]
        vmovd   xmm10,DWORD[r12]
        lea     r12,[64+r12]
        vmovd   xmm12,DWORD[r8]
        lea     r8,[64+r8]
        vmovd   xmm7,DWORD[r13]
        lea     r13,[64+r13]
        vmovd   xmm6,DWORD[r9]
        lea     r9,[64+r9]
        vpinsrd xmm10,xmm10,DWORD[r14],1
        lea     r14,[64+r14]
        vpinsrd xmm12,xmm12,DWORD[r10],1
        lea     r10,[64+r10]
        vpinsrd xmm7,xmm7,DWORD[r15],1
        lea     r15,[64+r15]
        vpunpckldq      ymm10,ymm10,ymm7
        vpinsrd xmm6,xmm6,DWORD[r11],1
        lea     r11,[64+r11]
        vpunpckldq      ymm12,ymm12,ymm6
        vmovd   xmm11,DWORD[((-60))+r12]
        vinserti128     ymm10,ymm10,xmm12,1
        vmovd   xmm8,DWORD[((-60))+r8]
        vpshufb ymm10,ymm10,ymm9
        vmovd   xmm7,DWORD[((-60))+r13]
        vmovd   xmm6,DWORD[((-60))+r9]
        vpinsrd xmm11,xmm11,DWORD[((-60))+r14],1
        vpinsrd xmm8,xmm8,DWORD[((-60))+r10],1
        vpinsrd xmm7,xmm7,DWORD[((-60))+r15],1
        vpunpckldq      ymm11,ymm11,ymm7
        vpinsrd xmm6,xmm6,DWORD[((-60))+r11],1
        vpunpckldq      ymm8,ymm8,ymm6
        vpaddd  ymm4,ymm4,ymm15
        vpslld  ymm7,ymm0,5
        vpandn  ymm6,ymm1,ymm3
        vpand   ymm5,ymm1,ymm2

        vmovdqa YMMWORD[(0-128)+rax],ymm10
        vpaddd  ymm4,ymm4,ymm10
        vinserti128     ymm11,ymm11,xmm8,1
        vpsrld  ymm8,ymm0,27
        vpxor   ymm5,ymm5,ymm6
        vmovd   xmm12,DWORD[((-56))+r12]

        vpslld  ymm6,ymm1,30
        vpor    ymm7,ymm7,ymm8
        vmovd   xmm8,DWORD[((-56))+r8]
        vpaddd  ymm4,ymm4,ymm5

        vpsrld  ymm1,ymm1,2
        vpaddd  ymm4,ymm4,ymm7
        vpshufb ymm11,ymm11,ymm9
        vpor    ymm1,ymm1,ymm6
        vmovd   xmm7,DWORD[((-56))+r13]
        vmovd   xmm6,DWORD[((-56))+r9]
        vpinsrd xmm12,xmm12,DWORD[((-56))+r14],1
        vpinsrd xmm8,xmm8,DWORD[((-56))+r10],1
        vpinsrd xmm7,xmm7,DWORD[((-56))+r15],1
        vpunpckldq      ymm12,ymm12,ymm7
        vpinsrd xmm6,xmm6,DWORD[((-56))+r11],1
        vpunpckldq      ymm8,ymm8,ymm6
        vpaddd  ymm3,ymm3,ymm15
        vpslld  ymm7,ymm4,5
        vpandn  ymm6,ymm0,ymm2
        vpand   ymm5,ymm0,ymm1

        vmovdqa YMMWORD[(32-128)+rax],ymm11
        vpaddd  ymm3,ymm3,ymm11
        vinserti128     ymm12,ymm12,xmm8,1
        vpsrld  ymm8,ymm4,27
        vpxor   ymm5,ymm5,ymm6
        vmovd   xmm13,DWORD[((-52))+r12]

        vpslld  ymm6,ymm0,30
        vpor    ymm7,ymm7,ymm8
        vmovd   xmm8,DWORD[((-52))+r8]
        vpaddd  ymm3,ymm3,ymm5

        vpsrld  ymm0,ymm0,2
        vpaddd  ymm3,ymm3,ymm7
        vpshufb ymm12,ymm12,ymm9
        vpor    ymm0,ymm0,ymm6
        vmovd   xmm7,DWORD[((-52))+r13]
        vmovd   xmm6,DWORD[((-52))+r9]
        vpinsrd xmm13,xmm13,DWORD[((-52))+r14],1
        vpinsrd xmm8,xmm8,DWORD[((-52))+r10],1
        vpinsrd xmm7,xmm7,DWORD[((-52))+r15],1
        vpunpckldq      ymm13,ymm13,ymm7
        vpinsrd xmm6,xmm6,DWORD[((-52))+r11],1
        vpunpckldq      ymm8,ymm8,ymm6
        vpaddd  ymm2,ymm2,ymm15
        vpslld  ymm7,ymm3,5
        vpandn  ymm6,ymm4,ymm1
        vpand   ymm5,ymm4,ymm0

        vmovdqa YMMWORD[(64-128)+rax],ymm12
        vpaddd  ymm2,ymm2,ymm12
        vinserti128     ymm13,ymm13,xmm8,1
        vpsrld  ymm8,ymm3,27
        vpxor   ymm5,ymm5,ymm6
        vmovd   xmm14,DWORD[((-48))+r12]

        vpslld  ymm6,ymm4,30
        vpor    ymm7,ymm7,ymm8
        vmovd   xmm8,DWORD[((-48))+r8]
        vpaddd  ymm2,ymm2,ymm5

        vpsrld  ymm4,ymm4,2
        vpaddd  ymm2,ymm2,ymm7
        vpshufb ymm13,ymm13,ymm9
        vpor    ymm4,ymm4,ymm6
        vmovd   xmm7,DWORD[((-48))+r13]
        vmovd   xmm6,DWORD[((-48))+r9]
        vpinsrd xmm14,xmm14,DWORD[((-48))+r14],1
        vpinsrd xmm8,xmm8,DWORD[((-48))+r10],1
        vpinsrd xmm7,xmm7,DWORD[((-48))+r15],1
        vpunpckldq      ymm14,ymm14,ymm7
        vpinsrd xmm6,xmm6,DWORD[((-48))+r11],1
        vpunpckldq      ymm8,ymm8,ymm6
        vpaddd  ymm1,ymm1,ymm15
        vpslld  ymm7,ymm2,5
        vpandn  ymm6,ymm3,ymm0
        vpand   ymm5,ymm3,ymm4

        vmovdqa YMMWORD[(96-128)+rax],ymm13
        vpaddd  ymm1,ymm1,ymm13
        vinserti128     ymm14,ymm14,xmm8,1
        vpsrld  ymm8,ymm2,27
        vpxor   ymm5,ymm5,ymm6
        vmovd   xmm10,DWORD[((-44))+r12]

        vpslld  ymm6,ymm3,30
        vpor    ymm7,ymm7,ymm8
        vmovd   xmm8,DWORD[((-44))+r8]
        vpaddd  ymm1,ymm1,ymm5

        vpsrld  ymm3,ymm3,2
        vpaddd  ymm1,ymm1,ymm7
        vpshufb ymm14,ymm14,ymm9
        vpor    ymm3,ymm3,ymm6
        vmovd   xmm7,DWORD[((-44))+r13]
        vmovd   xmm6,DWORD[((-44))+r9]
        vpinsrd xmm10,xmm10,DWORD[((-44))+r14],1
        vpinsrd xmm8,xmm8,DWORD[((-44))+r10],1
        vpinsrd xmm7,xmm7,DWORD[((-44))+r15],1
        vpunpckldq      ymm10,ymm10,ymm7
        vpinsrd xmm6,xmm6,DWORD[((-44))+r11],1
        vpunpckldq      ymm8,ymm8,ymm6
        vpaddd  ymm0,ymm0,ymm15
        vpslld  ymm7,ymm1,5
        vpandn  ymm6,ymm2,ymm4
        vpand   ymm5,ymm2,ymm3

        vmovdqa YMMWORD[(128-128)+rax],ymm14
        vpaddd  ymm0,ymm0,ymm14
        vinserti128     ymm10,ymm10,xmm8,1
        vpsrld  ymm8,ymm1,27
        vpxor   ymm5,ymm5,ymm6
        vmovd   xmm11,DWORD[((-40))+r12]

        vpslld  ymm6,ymm2,30
        vpor    ymm7,ymm7,ymm8
        vmovd   xmm8,DWORD[((-40))+r8]
        vpaddd  ymm0,ymm0,ymm5

        vpsrld  ymm2,ymm2,2
        vpaddd  ymm0,ymm0,ymm7
        vpshufb ymm10,ymm10,ymm9
        vpor    ymm2,ymm2,ymm6
        vmovd   xmm7,DWORD[((-40))+r13]
        vmovd   xmm6,DWORD[((-40))+r9]
        vpinsrd xmm11,xmm11,DWORD[((-40))+r14],1
        vpinsrd xmm8,xmm8,DWORD[((-40))+r10],1
        vpinsrd xmm7,xmm7,DWORD[((-40))+r15],1
        vpunpckldq      ymm11,ymm11,ymm7
        vpinsrd xmm6,xmm6,DWORD[((-40))+r11],1
        vpunpckldq      ymm8,ymm8,ymm6
        vpaddd  ymm4,ymm4,ymm15
        vpslld  ymm7,ymm0,5
        vpandn  ymm6,ymm1,ymm3
        vpand   ymm5,ymm1,ymm2

        vmovdqa YMMWORD[(160-128)+rax],ymm10
        vpaddd  ymm4,ymm4,ymm10
        vinserti128     ymm11,ymm11,xmm8,1
        vpsrld  ymm8,ymm0,27
        vpxor   ymm5,ymm5,ymm6
        vmovd   xmm12,DWORD[((-36))+r12]

        vpslld  ymm6,ymm1,30
        vpor    ymm7,ymm7,ymm8
        vmovd   xmm8,DWORD[((-36))+r8]
        vpaddd  ymm4,ymm4,ymm5

        vpsrld  ymm1,ymm1,2
        vpaddd  ymm4,ymm4,ymm7
        vpshufb ymm11,ymm11,ymm9
        vpor    ymm1,ymm1,ymm6
        vmovd   xmm7,DWORD[((-36))+r13]
        vmovd   xmm6,DWORD[((-36))+r9]
        vpinsrd xmm12,xmm12,DWORD[((-36))+r14],1
        vpinsrd xmm8,xmm8,DWORD[((-36))+r10],1
        vpinsrd xmm7,xmm7,DWORD[((-36))+r15],1
        vpunpckldq      ymm12,ymm12,ymm7
        vpinsrd xmm6,xmm6,DWORD[((-36))+r11],1
        vpunpckldq      ymm8,ymm8,ymm6
        vpaddd  ymm3,ymm3,ymm15
        vpslld  ymm7,ymm4,5
        vpandn  ymm6,ymm0,ymm2
        vpand   ymm5,ymm0,ymm1

        vmovdqa YMMWORD[(192-128)+rax],ymm11
        vpaddd  ymm3,ymm3,ymm11
        vinserti128     ymm12,ymm12,xmm8,1
        vpsrld  ymm8,ymm4,27
        vpxor   ymm5,ymm5,ymm6
        vmovd   xmm13,DWORD[((-32))+r12]

        vpslld  ymm6,ymm0,30
        vpor    ymm7,ymm7,ymm8
        vmovd   xmm8,DWORD[((-32))+r8]
        vpaddd  ymm3,ymm3,ymm5

        vpsrld  ymm0,ymm0,2
        vpaddd  ymm3,ymm3,ymm7
        vpshufb ymm12,ymm12,ymm9
        vpor    ymm0,ymm0,ymm6
        vmovd   xmm7,DWORD[((-32))+r13]
        vmovd   xmm6,DWORD[((-32))+r9]
        vpinsrd xmm13,xmm13,DWORD[((-32))+r14],1
        vpinsrd xmm8,xmm8,DWORD[((-32))+r10],1
        vpinsrd xmm7,xmm7,DWORD[((-32))+r15],1
        vpunpckldq      ymm13,ymm13,ymm7
        vpinsrd xmm6,xmm6,DWORD[((-32))+r11],1
        vpunpckldq      ymm8,ymm8,ymm6
        vpaddd  ymm2,ymm2,ymm15
        vpslld  ymm7,ymm3,5
        vpandn  ymm6,ymm4,ymm1
        vpand   ymm5,ymm4,ymm0

        vmovdqa YMMWORD[(224-128)+rax],ymm12
        vpaddd  ymm2,ymm2,ymm12
        vinserti128     ymm13,ymm13,xmm8,1
        vpsrld  ymm8,ymm3,27
        vpxor   ymm5,ymm5,ymm6
        vmovd   xmm14,DWORD[((-28))+r12]

        vpslld  ymm6,ymm4,30
        vpor    ymm7,ymm7,ymm8
        vmovd   xmm8,DWORD[((-28))+r8]
        vpaddd  ymm2,ymm2,ymm5

        vpsrld  ymm4,ymm4,2
        vpaddd  ymm2,ymm2,ymm7
        vpshufb ymm13,ymm13,ymm9
        vpor    ymm4,ymm4,ymm6
        vmovd   xmm7,DWORD[((-28))+r13]
        vmovd   xmm6,DWORD[((-28))+r9]
        vpinsrd xmm14,xmm14,DWORD[((-28))+r14],1
        vpinsrd xmm8,xmm8,DWORD[((-28))+r10],1
        vpinsrd xmm7,xmm7,DWORD[((-28))+r15],1
        vpunpckldq      ymm14,ymm14,ymm7
        vpinsrd xmm6,xmm6,DWORD[((-28))+r11],1
        vpunpckldq      ymm8,ymm8,ymm6
        vpaddd  ymm1,ymm1,ymm15
        vpslld  ymm7,ymm2,5
        vpandn  ymm6,ymm3,ymm0
        vpand   ymm5,ymm3,ymm4

        vmovdqa YMMWORD[(256-256-128)+rbx],ymm13
        vpaddd  ymm1,ymm1,ymm13
        vinserti128     ymm14,ymm14,xmm8,1
        vpsrld  ymm8,ymm2,27
        vpxor   ymm5,ymm5,ymm6
        vmovd   xmm10,DWORD[((-24))+r12]

        vpslld  ymm6,ymm3,30
        vpor    ymm7,ymm7,ymm8
        vmovd   xmm8,DWORD[((-24))+r8]
        vpaddd  ymm1,ymm1,ymm5

        vpsrld  ymm3,ymm3,2
        vpaddd  ymm1,ymm1,ymm7
        vpshufb ymm14,ymm14,ymm9
        vpor    ymm3,ymm3,ymm6
        vmovd   xmm7,DWORD[((-24))+r13]
        vmovd   xmm6,DWORD[((-24))+r9]
        vpinsrd xmm10,xmm10,DWORD[((-24))+r14],1
        vpinsrd xmm8,xmm8,DWORD[((-24))+r10],1
        vpinsrd xmm7,xmm7,DWORD[((-24))+r15],1
        vpunpckldq      ymm10,ymm10,ymm7
        vpinsrd xmm6,xmm6,DWORD[((-24))+r11],1
        vpunpckldq      ymm8,ymm8,ymm6
        vpaddd  ymm0,ymm0,ymm15
        vpslld  ymm7,ymm1,5
        vpandn  ymm6,ymm2,ymm4
        vpand   ymm5,ymm2,ymm3

        vmovdqa YMMWORD[(288-256-128)+rbx],ymm14
        vpaddd  ymm0,ymm0,ymm14
        vinserti128     ymm10,ymm10,xmm8,1
        vpsrld  ymm8,ymm1,27
        vpxor   ymm5,ymm5,ymm6
        vmovd   xmm11,DWORD[((-20))+r12]

        vpslld  ymm6,ymm2,30
        vpor    ymm7,ymm7,ymm8
        vmovd   xmm8,DWORD[((-20))+r8]
        vpaddd  ymm0,ymm0,ymm5

        vpsrld  ymm2,ymm2,2
        vpaddd  ymm0,ymm0,ymm7
        vpshufb ymm10,ymm10,ymm9
        vpor    ymm2,ymm2,ymm6
        vmovd   xmm7,DWORD[((-20))+r13]
        vmovd   xmm6,DWORD[((-20))+r9]
        vpinsrd xmm11,xmm11,DWORD[((-20))+r14],1
        vpinsrd xmm8,xmm8,DWORD[((-20))+r10],1
        vpinsrd xmm7,xmm7,DWORD[((-20))+r15],1
        vpunpckldq      ymm11,ymm11,ymm7
        vpinsrd xmm6,xmm6,DWORD[((-20))+r11],1
        vpunpckldq      ymm8,ymm8,ymm6
        vpaddd  ymm4,ymm4,ymm15
        vpslld  ymm7,ymm0,5
        vpandn  ymm6,ymm1,ymm3
        vpand   ymm5,ymm1,ymm2

        vmovdqa YMMWORD[(320-256-128)+rbx],ymm10
        vpaddd  ymm4,ymm4,ymm10
        vinserti128     ymm11,ymm11,xmm8,1
        vpsrld  ymm8,ymm0,27
        vpxor   ymm5,ymm5,ymm6
        vmovd   xmm12,DWORD[((-16))+r12]

        vpslld  ymm6,ymm1,30
        vpor    ymm7,ymm7,ymm8
        vmovd   xmm8,DWORD[((-16))+r8]
        vpaddd  ymm4,ymm4,ymm5

        vpsrld  ymm1,ymm1,2
        vpaddd  ymm4,ymm4,ymm7
        vpshufb ymm11,ymm11,ymm9
        vpor    ymm1,ymm1,ymm6
        vmovd   xmm7,DWORD[((-16))+r13]
        vmovd   xmm6,DWORD[((-16))+r9]
        vpinsrd xmm12,xmm12,DWORD[((-16))+r14],1
        vpinsrd xmm8,xmm8,DWORD[((-16))+r10],1
        vpinsrd xmm7,xmm7,DWORD[((-16))+r15],1
        vpunpckldq      ymm12,ymm12,ymm7
        vpinsrd xmm6,xmm6,DWORD[((-16))+r11],1
        vpunpckldq      ymm8,ymm8,ymm6
        vpaddd  ymm3,ymm3,ymm15
        vpslld  ymm7,ymm4,5
        vpandn  ymm6,ymm0,ymm2
        vpand   ymm5,ymm0,ymm1

        vmovdqa YMMWORD[(352-256-128)+rbx],ymm11
        vpaddd  ymm3,ymm3,ymm11
        vinserti128     ymm12,ymm12,xmm8,1
        vpsrld  ymm8,ymm4,27
        vpxor   ymm5,ymm5,ymm6
        vmovd   xmm13,DWORD[((-12))+r12]

        vpslld  ymm6,ymm0,30
        vpor    ymm7,ymm7,ymm8
        vmovd   xmm8,DWORD[((-12))+r8]
        vpaddd  ymm3,ymm3,ymm5

        vpsrld  ymm0,ymm0,2
        vpaddd  ymm3,ymm3,ymm7
        vpshufb ymm12,ymm12,ymm9
        vpor    ymm0,ymm0,ymm6
        vmovd   xmm7,DWORD[((-12))+r13]
        vmovd   xmm6,DWORD[((-12))+r9]
        vpinsrd xmm13,xmm13,DWORD[((-12))+r14],1
        vpinsrd xmm8,xmm8,DWORD[((-12))+r10],1
        vpinsrd xmm7,xmm7,DWORD[((-12))+r15],1
        vpunpckldq      ymm13,ymm13,ymm7
        vpinsrd xmm6,xmm6,DWORD[((-12))+r11],1
        vpunpckldq      ymm8,ymm8,ymm6
        vpaddd  ymm2,ymm2,ymm15
        vpslld  ymm7,ymm3,5
        vpandn  ymm6,ymm4,ymm1
        vpand   ymm5,ymm4,ymm0

        vmovdqa YMMWORD[(384-256-128)+rbx],ymm12
        vpaddd  ymm2,ymm2,ymm12
        vinserti128     ymm13,ymm13,xmm8,1
        vpsrld  ymm8,ymm3,27
        vpxor   ymm5,ymm5,ymm6
        vmovd   xmm14,DWORD[((-8))+r12]

        vpslld  ymm6,ymm4,30
        vpor    ymm7,ymm7,ymm8
        vmovd   xmm8,DWORD[((-8))+r8]
        vpaddd  ymm2,ymm2,ymm5

        vpsrld  ymm4,ymm4,2
        vpaddd  ymm2,ymm2,ymm7
        vpshufb ymm13,ymm13,ymm9
        vpor    ymm4,ymm4,ymm6
        vmovd   xmm7,DWORD[((-8))+r13]
        vmovd   xmm6,DWORD[((-8))+r9]
        vpinsrd xmm14,xmm14,DWORD[((-8))+r14],1
        vpinsrd xmm8,xmm8,DWORD[((-8))+r10],1
        vpinsrd xmm7,xmm7,DWORD[((-8))+r15],1
        vpunpckldq      ymm14,ymm14,ymm7
        vpinsrd xmm6,xmm6,DWORD[((-8))+r11],1
        vpunpckldq      ymm8,ymm8,ymm6
        vpaddd  ymm1,ymm1,ymm15
        vpslld  ymm7,ymm2,5
        vpandn  ymm6,ymm3,ymm0
        vpand   ymm5,ymm3,ymm4

        vmovdqa YMMWORD[(416-256-128)+rbx],ymm13
        vpaddd  ymm1,ymm1,ymm13
        vinserti128     ymm14,ymm14,xmm8,1
        vpsrld  ymm8,ymm2,27
        vpxor   ymm5,ymm5,ymm6
        vmovd   xmm10,DWORD[((-4))+r12]

        vpslld  ymm6,ymm3,30
        vpor    ymm7,ymm7,ymm8
        vmovd   xmm8,DWORD[((-4))+r8]
        vpaddd  ymm1,ymm1,ymm5

        vpsrld  ymm3,ymm3,2
        vpaddd  ymm1,ymm1,ymm7
        vpshufb ymm14,ymm14,ymm9
        vpor    ymm3,ymm3,ymm6
        vmovdqa ymm11,YMMWORD[((0-128))+rax]
        vmovd   xmm7,DWORD[((-4))+r13]
        vmovd   xmm6,DWORD[((-4))+r9]
        vpinsrd xmm10,xmm10,DWORD[((-4))+r14],1
        vpinsrd xmm8,xmm8,DWORD[((-4))+r10],1
        vpinsrd xmm7,xmm7,DWORD[((-4))+r15],1
        vpunpckldq      ymm10,ymm10,ymm7
        vpinsrd xmm6,xmm6,DWORD[((-4))+r11],1
        vpunpckldq      ymm8,ymm8,ymm6
        vpaddd  ymm0,ymm0,ymm15
        prefetcht0      [63+r12]
        vpslld  ymm7,ymm1,5
        vpandn  ymm6,ymm2,ymm4
        vpand   ymm5,ymm2,ymm3

        vmovdqa YMMWORD[(448-256-128)+rbx],ymm14
        vpaddd  ymm0,ymm0,ymm14
        vinserti128     ymm10,ymm10,xmm8,1
        vpsrld  ymm8,ymm1,27
        prefetcht0      [63+r13]
        vpxor   ymm5,ymm5,ymm6

        vpslld  ymm6,ymm2,30
        vpor    ymm7,ymm7,ymm8
        prefetcht0      [63+r14]
        vpaddd  ymm0,ymm0,ymm5

        vpsrld  ymm2,ymm2,2
        vpaddd  ymm0,ymm0,ymm7
        prefetcht0      [63+r15]
        vpshufb ymm10,ymm10,ymm9
        vpor    ymm2,ymm2,ymm6
        vmovdqa ymm12,YMMWORD[((32-128))+rax]
        vpxor   ymm11,ymm11,ymm13
        vmovdqa ymm13,YMMWORD[((64-128))+rax]

        vpaddd  ymm4,ymm4,ymm15
        vpslld  ymm7,ymm0,5
        vpandn  ymm6,ymm1,ymm3
        prefetcht0      [63+r8]
        vpand   ymm5,ymm1,ymm2

        vmovdqa YMMWORD[(480-256-128)+rbx],ymm10
        vpaddd  ymm4,ymm4,ymm10
        vpxor   ymm11,ymm11,YMMWORD[((256-256-128))+rbx]
        vpsrld  ymm8,ymm0,27
        vpxor   ymm5,ymm5,ymm6
        vpxor   ymm11,ymm11,ymm13
        prefetcht0      [63+r9]

        vpslld  ymm6,ymm1,30
        vpor    ymm7,ymm7,ymm8
        vpaddd  ymm4,ymm4,ymm5
        prefetcht0      [63+r10]
        vpsrld  ymm9,ymm11,31
        vpaddd  ymm11,ymm11,ymm11

        vpsrld  ymm1,ymm1,2
        prefetcht0      [63+r11]
        vpaddd  ymm4,ymm4,ymm7
        vpor    ymm11,ymm11,ymm9
        vpor    ymm1,ymm1,ymm6
        vpxor   ymm12,ymm12,ymm14
        vmovdqa ymm14,YMMWORD[((96-128))+rax]

        vpaddd  ymm3,ymm3,ymm15
        vpslld  ymm7,ymm4,5
        vpandn  ymm6,ymm0,ymm2

        vpand   ymm5,ymm0,ymm1

        vmovdqa YMMWORD[(0-128)+rax],ymm11
        vpaddd  ymm3,ymm3,ymm11
        vpxor   ymm12,ymm12,YMMWORD[((288-256-128))+rbx]
        vpsrld  ymm8,ymm4,27
        vpxor   ymm5,ymm5,ymm6
        vpxor   ymm12,ymm12,ymm14


        vpslld  ymm6,ymm0,30
        vpor    ymm7,ymm7,ymm8
        vpaddd  ymm3,ymm3,ymm5

        vpsrld  ymm9,ymm12,31
        vpaddd  ymm12,ymm12,ymm12

        vpsrld  ymm0,ymm0,2

        vpaddd  ymm3,ymm3,ymm7
        vpor    ymm12,ymm12,ymm9
        vpor    ymm0,ymm0,ymm6
        vpxor   ymm13,ymm13,ymm10
        vmovdqa ymm10,YMMWORD[((128-128))+rax]

        vpaddd  ymm2,ymm2,ymm15
        vpslld  ymm7,ymm3,5
        vpandn  ymm6,ymm4,ymm1

        vpand   ymm5,ymm4,ymm0

        vmovdqa YMMWORD[(32-128)+rax],ymm12
        vpaddd  ymm2,ymm2,ymm12
        vpxor   ymm13,ymm13,YMMWORD[((320-256-128))+rbx]
        vpsrld  ymm8,ymm3,27
        vpxor   ymm5,ymm5,ymm6
        vpxor   ymm13,ymm13,ymm10


        vpslld  ymm6,ymm4,30
        vpor    ymm7,ymm7,ymm8
        vpaddd  ymm2,ymm2,ymm5

        vpsrld  ymm9,ymm13,31
        vpaddd  ymm13,ymm13,ymm13

        vpsrld  ymm4,ymm4,2

        vpaddd  ymm2,ymm2,ymm7
        vpor    ymm13,ymm13,ymm9
        vpor    ymm4,ymm4,ymm6
        vpxor   ymm14,ymm14,ymm11
        vmovdqa ymm11,YMMWORD[((160-128))+rax]

        vpaddd  ymm1,ymm1,ymm15
        vpslld  ymm7,ymm2,5
        vpandn  ymm6,ymm3,ymm0

        vpand   ymm5,ymm3,ymm4

        vmovdqa YMMWORD[(64-128)+rax],ymm13
        vpaddd  ymm1,ymm1,ymm13
        vpxor   ymm14,ymm14,YMMWORD[((352-256-128))+rbx]
        vpsrld  ymm8,ymm2,27
        vpxor   ymm5,ymm5,ymm6
        vpxor   ymm14,ymm14,ymm11


        vpslld  ymm6,ymm3,30
        vpor    ymm7,ymm7,ymm8
        vpaddd  ymm1,ymm1,ymm5

        vpsrld  ymm9,ymm14,31
        vpaddd  ymm14,ymm14,ymm14

        vpsrld  ymm3,ymm3,2

        vpaddd  ymm1,ymm1,ymm7
        vpor    ymm14,ymm14,ymm9
        vpor    ymm3,ymm3,ymm6
        vpxor   ymm10,ymm10,ymm12
        vmovdqa ymm12,YMMWORD[((192-128))+rax]

        vpaddd  ymm0,ymm0,ymm15
        vpslld  ymm7,ymm1,5
        vpandn  ymm6,ymm2,ymm4

        vpand   ymm5,ymm2,ymm3

        vmovdqa YMMWORD[(96-128)+rax],ymm14
        vpaddd  ymm0,ymm0,ymm14
        vpxor   ymm10,ymm10,YMMWORD[((384-256-128))+rbx]
        vpsrld  ymm8,ymm1,27
        vpxor   ymm5,ymm5,ymm6
        vpxor   ymm10,ymm10,ymm12


        vpslld  ymm6,ymm2,30
        vpor    ymm7,ymm7,ymm8
        vpaddd  ymm0,ymm0,ymm5

        vpsrld  ymm9,ymm10,31
        vpaddd  ymm10,ymm10,ymm10

        vpsrld  ymm2,ymm2,2

        vpaddd  ymm0,ymm0,ymm7
        vpor    ymm10,ymm10,ymm9
        vpor    ymm2,ymm2,ymm6
        vmovdqa ymm15,YMMWORD[rbp]
        vpxor   ymm11,ymm11,ymm13
        vmovdqa ymm13,YMMWORD[((224-128))+rax]

        vpslld  ymm7,ymm0,5
        vpaddd  ymm4,ymm4,ymm15
        vpxor   ymm5,ymm3,ymm1
        vmovdqa YMMWORD[(128-128)+rax],ymm10
        vpaddd  ymm4,ymm4,ymm10
        vpxor   ymm11,ymm11,YMMWORD[((416-256-128))+rbx]
        vpsrld  ymm8,ymm0,27
        vpxor   ymm5,ymm5,ymm2
        vpxor   ymm11,ymm11,ymm13

        vpslld  ymm6,ymm1,30
        vpor    ymm7,ymm7,ymm8
        vpaddd  ymm4,ymm4,ymm5
        vpsrld  ymm9,ymm11,31
        vpaddd  ymm11,ymm11,ymm11

        vpsrld  ymm1,ymm1,2
        vpaddd  ymm4,ymm4,ymm7
        vpor    ymm11,ymm11,ymm9
        vpor    ymm1,ymm1,ymm6
        vpxor   ymm12,ymm12,ymm14
        vmovdqa ymm14,YMMWORD[((256-256-128))+rbx]

        vpslld  ymm7,ymm4,5
        vpaddd  ymm3,ymm3,ymm15
        vpxor   ymm5,ymm2,ymm0
        vmovdqa YMMWORD[(160-128)+rax],ymm11
        vpaddd  ymm3,ymm3,ymm11
        vpxor   ymm12,ymm12,YMMWORD[((448-256-128))+rbx]
        vpsrld  ymm8,ymm4,27
        vpxor   ymm5,ymm5,ymm1
        vpxor   ymm12,ymm12,ymm14

        vpslld  ymm6,ymm0,30
        vpor    ymm7,ymm7,ymm8
        vpaddd  ymm3,ymm3,ymm5
        vpsrld  ymm9,ymm12,31
        vpaddd  ymm12,ymm12,ymm12

        vpsrld  ymm0,ymm0,2
        vpaddd  ymm3,ymm3,ymm7
        vpor    ymm12,ymm12,ymm9
        vpor    ymm0,ymm0,ymm6
        vpxor   ymm13,ymm13,ymm10
        vmovdqa ymm10,YMMWORD[((288-256-128))+rbx]

        vpslld  ymm7,ymm3,5
        vpaddd  ymm2,ymm2,ymm15
        vpxor   ymm5,ymm1,ymm4
        vmovdqa YMMWORD[(192-128)+rax],ymm12
        vpaddd  ymm2,ymm2,ymm12
        vpxor   ymm13,ymm13,YMMWORD[((480-256-128))+rbx]
        vpsrld  ymm8,ymm3,27
        vpxor   ymm5,ymm5,ymm0
        vpxor   ymm13,ymm13,ymm10

        vpslld  ymm6,ymm4,30
        vpor    ymm7,ymm7,ymm8
        vpaddd  ymm2,ymm2,ymm5
        vpsrld  ymm9,ymm13,31
        vpaddd  ymm13,ymm13,ymm13

        vpsrld  ymm4,ymm4,2
        vpaddd  ymm2,ymm2,ymm7
        vpor    ymm13,ymm13,ymm9
        vpor    ymm4,ymm4,ymm6
        vpxor   ymm14,ymm14,ymm11
        vmovdqa ymm11,YMMWORD[((320-256-128))+rbx]

        vpslld  ymm7,ymm2,5
        vpaddd  ymm1,ymm1,ymm15
        vpxor   ymm5,ymm0,ymm3
        vmovdqa YMMWORD[(224-128)+rax],ymm13
        vpaddd  ymm1,ymm1,ymm13
        vpxor   ymm14,ymm14,YMMWORD[((0-128))+rax]
        vpsrld  ymm8,ymm2,27
        vpxor   ymm5,ymm5,ymm4
        vpxor   ymm14,ymm14,ymm11

        vpslld  ymm6,ymm3,30
        vpor    ymm7,ymm7,ymm8
        vpaddd  ymm1,ymm1,ymm5
        vpsrld  ymm9,ymm14,31
        vpaddd  ymm14,ymm14,ymm14

        vpsrld  ymm3,ymm3,2
        vpaddd  ymm1,ymm1,ymm7
        vpor    ymm14,ymm14,ymm9
        vpor    ymm3,ymm3,ymm6
        vpxor   ymm10,ymm10,ymm12
        vmovdqa ymm12,YMMWORD[((352-256-128))+rbx]

        vpslld  ymm7,ymm1,5
        vpaddd  ymm0,ymm0,ymm15
        vpxor   ymm5,ymm4,ymm2
        vmovdqa YMMWORD[(256-256-128)+rbx],ymm14
        vpaddd  ymm0,ymm0,ymm14
        vpxor   ymm10,ymm10,YMMWORD[((32-128))+rax]
        vpsrld  ymm8,ymm1,27
        vpxor   ymm5,ymm5,ymm3
        vpxor   ymm10,ymm10,ymm12

        vpslld  ymm6,ymm2,30
        vpor    ymm7,ymm7,ymm8
        vpaddd  ymm0,ymm0,ymm5
        vpsrld  ymm9,ymm10,31
        vpaddd  ymm10,ymm10,ymm10

        vpsrld  ymm2,ymm2,2
        vpaddd  ymm0,ymm0,ymm7
        vpor    ymm10,ymm10,ymm9
        vpor    ymm2,ymm2,ymm6
        vpxor   ymm11,ymm11,ymm13
        vmovdqa ymm13,YMMWORD[((384-256-128))+rbx]

        vpslld  ymm7,ymm0,5
        vpaddd  ymm4,ymm4,ymm15
        vpxor   ymm5,ymm3,ymm1
        vmovdqa YMMWORD[(288-256-128)+rbx],ymm10
        vpaddd  ymm4,ymm4,ymm10
        vpxor   ymm11,ymm11,YMMWORD[((64-128))+rax]
        vpsrld  ymm8,ymm0,27
        vpxor   ymm5,ymm5,ymm2
        vpxor   ymm11,ymm11,ymm13

        vpslld  ymm6,ymm1,30
        vpor    ymm7,ymm7,ymm8
        vpaddd  ymm4,ymm4,ymm5
        vpsrld  ymm9,ymm11,31
        vpaddd  ymm11,ymm11,ymm11

        vpsrld  ymm1,ymm1,2
        vpaddd  ymm4,ymm4,ymm7
        vpor    ymm11,ymm11,ymm9
        vpor    ymm1,ymm1,ymm6
        vpxor   ymm12,ymm12,ymm14
        vmovdqa ymm14,YMMWORD[((416-256-128))+rbx]

        vpslld  ymm7,ymm4,5
        vpaddd  ymm3,ymm3,ymm15
        vpxor   ymm5,ymm2,ymm0
        vmovdqa YMMWORD[(320-256-128)+rbx],ymm11
        vpaddd  ymm3,ymm3,ymm11
        vpxor   ymm12,ymm12,YMMWORD[((96-128))+rax]
        vpsrld  ymm8,ymm4,27
        vpxor   ymm5,ymm5,ymm1
        vpxor   ymm12,ymm12,ymm14

        vpslld  ymm6,ymm0,30
        vpor    ymm7,ymm7,ymm8
        vpaddd  ymm3,ymm3,ymm5
        vpsrld  ymm9,ymm12,31
        vpaddd  ymm12,ymm12,ymm12

        vpsrld  ymm0,ymm0,2
        vpaddd  ymm3,ymm3,ymm7
        vpor    ymm12,ymm12,ymm9
        vpor    ymm0,ymm0,ymm6
        vpxor   ymm13,ymm13,ymm10
        vmovdqa ymm10,YMMWORD[((448-256-128))+rbx]

        vpslld  ymm7,ymm3,5
        vpaddd  ymm2,ymm2,ymm15
        vpxor   ymm5,ymm1,ymm4
        vmovdqa YMMWORD[(352-256-128)+rbx],ymm12
        vpaddd  ymm2,ymm2,ymm12
        vpxor   ymm13,ymm13,YMMWORD[((128-128))+rax]
        vpsrld  ymm8,ymm3,27
        vpxor   ymm5,ymm5,ymm0
        vpxor   ymm13,ymm13,ymm10

        vpslld  ymm6,ymm4,30
        vpor    ymm7,ymm7,ymm8
        vpaddd  ymm2,ymm2,ymm5
        vpsrld  ymm9,ymm13,31
        vpaddd  ymm13,ymm13,ymm13

        vpsrld  ymm4,ymm4,2
        vpaddd  ymm2,ymm2,ymm7
        vpor    ymm13,ymm13,ymm9
        vpor    ymm4,ymm4,ymm6
        vpxor   ymm14,ymm14,ymm11
        vmovdqa ymm11,YMMWORD[((480-256-128))+rbx]

        vpslld  ymm7,ymm2,5
        vpaddd  ymm1,ymm1,ymm15
        vpxor   ymm5,ymm0,ymm3
        vmovdqa YMMWORD[(384-256-128)+rbx],ymm13
        vpaddd  ymm1,ymm1,ymm13
        vpxor   ymm14,ymm14,YMMWORD[((160-128))+rax]
        vpsrld  ymm8,ymm2,27
        vpxor   ymm5,ymm5,ymm4
        vpxor   ymm14,ymm14,ymm11

        vpslld  ymm6,ymm3,30
        vpor    ymm7,ymm7,ymm8
        vpaddd  ymm1,ymm1,ymm5
        vpsrld  ymm9,ymm14,31
        vpaddd  ymm14,ymm14,ymm14

        vpsrld  ymm3,ymm3,2
        vpaddd  ymm1,ymm1,ymm7
        vpor    ymm14,ymm14,ymm9
        vpor    ymm3,ymm3,ymm6
        vpxor   ymm10,ymm10,ymm12
        vmovdqa ymm12,YMMWORD[((0-128))+rax]

        vpslld  ymm7,ymm1,5
        vpaddd  ymm0,ymm0,ymm15
        vpxor   ymm5,ymm4,ymm2
        vmovdqa YMMWORD[(416-256-128)+rbx],ymm14
        vpaddd  ymm0,ymm0,ymm14
        vpxor   ymm10,ymm10,YMMWORD[((192-128))+rax]
        vpsrld  ymm8,ymm1,27
        vpxor   ymm5,ymm5,ymm3
        vpxor   ymm10,ymm10,ymm12

        vpslld  ymm6,ymm2,30
        vpor    ymm7,ymm7,ymm8
        vpaddd  ymm0,ymm0,ymm5
        vpsrld  ymm9,ymm10,31
        vpaddd  ymm10,ymm10,ymm10

        vpsrld  ymm2,ymm2,2
        vpaddd  ymm0,ymm0,ymm7
        vpor    ymm10,ymm10,ymm9
        vpor    ymm2,ymm2,ymm6
        vpxor   ymm11,ymm11,ymm13
        vmovdqa ymm13,YMMWORD[((32-128))+rax]

        vpslld  ymm7,ymm0,5
        vpaddd  ymm4,ymm4,ymm15
        vpxor   ymm5,ymm3,ymm1
        vmovdqa YMMWORD[(448-256-128)+rbx],ymm10
        vpaddd  ymm4,ymm4,ymm10
        vpxor   ymm11,ymm11,YMMWORD[((224-128))+rax]
        vpsrld  ymm8,ymm0,27
        vpxor   ymm5,ymm5,ymm2
        vpxor   ymm11,ymm11,ymm13

        vpslld  ymm6,ymm1,30
        vpor    ymm7,ymm7,ymm8
        vpaddd  ymm4,ymm4,ymm5
        vpsrld  ymm9,ymm11,31
        vpaddd  ymm11,ymm11,ymm11

        vpsrld  ymm1,ymm1,2
        vpaddd  ymm4,ymm4,ymm7
        vpor    ymm11,ymm11,ymm9
        vpor    ymm1,ymm1,ymm6
        vpxor   ymm12,ymm12,ymm14
        vmovdqa ymm14,YMMWORD[((64-128))+rax]

        vpslld  ymm7,ymm4,5
        vpaddd  ymm3,ymm3,ymm15
        vpxor   ymm5,ymm2,ymm0
        vmovdqa YMMWORD[(480-256-128)+rbx],ymm11
        vpaddd  ymm3,ymm3,ymm11
        vpxor   ymm12,ymm12,YMMWORD[((256-256-128))+rbx]
        vpsrld  ymm8,ymm4,27
        vpxor   ymm5,ymm5,ymm1
        vpxor   ymm12,ymm12,ymm14

        vpslld  ymm6,ymm0,30
        vpor    ymm7,ymm7,ymm8
        vpaddd  ymm3,ymm3,ymm5
        vpsrld  ymm9,ymm12,31
        vpaddd  ymm12,ymm12,ymm12

        vpsrld  ymm0,ymm0,2
        vpaddd  ymm3,ymm3,ymm7
        vpor    ymm12,ymm12,ymm9
        vpor    ymm0,ymm0,ymm6
        vpxor   ymm13,ymm13,ymm10
        vmovdqa ymm10,YMMWORD[((96-128))+rax]

        vpslld  ymm7,ymm3,5
        vpaddd  ymm2,ymm2,ymm15
        vpxor   ymm5,ymm1,ymm4
        vmovdqa YMMWORD[(0-128)+rax],ymm12
        vpaddd  ymm2,ymm2,ymm12
        vpxor   ymm13,ymm13,YMMWORD[((288-256-128))+rbx]
        vpsrld  ymm8,ymm3,27
        vpxor   ymm5,ymm5,ymm0
        vpxor   ymm13,ymm13,ymm10

        vpslld  ymm6,ymm4,30
        vpor    ymm7,ymm7,ymm8
        vpaddd  ymm2,ymm2,ymm5
        vpsrld  ymm9,ymm13,31
        vpaddd  ymm13,ymm13,ymm13

        vpsrld  ymm4,ymm4,2
        vpaddd  ymm2,ymm2,ymm7
        vpor    ymm13,ymm13,ymm9
        vpor    ymm4,ymm4,ymm6
        vpxor   ymm14,ymm14,ymm11
        vmovdqa ymm11,YMMWORD[((128-128))+rax]

        vpslld  ymm7,ymm2,5
        vpaddd  ymm1,ymm1,ymm15
        vpxor   ymm5,ymm0,ymm3
        vmovdqa YMMWORD[(32-128)+rax],ymm13
        vpaddd  ymm1,ymm1,ymm13
        vpxor   ymm14,ymm14,YMMWORD[((320-256-128))+rbx]
        vpsrld  ymm8,ymm2,27
        vpxor   ymm5,ymm5,ymm4
        vpxor   ymm14,ymm14,ymm11

        vpslld  ymm6,ymm3,30
        vpor    ymm7,ymm7,ymm8
        vpaddd  ymm1,ymm1,ymm5
        vpsrld  ymm9,ymm14,31
        vpaddd  ymm14,ymm14,ymm14

        vpsrld  ymm3,ymm3,2
        vpaddd  ymm1,ymm1,ymm7
        vpor    ymm14,ymm14,ymm9
        vpor    ymm3,ymm3,ymm6
        vpxor   ymm10,ymm10,ymm12
        vmovdqa ymm12,YMMWORD[((160-128))+rax]

        vpslld  ymm7,ymm1,5
        vpaddd  ymm0,ymm0,ymm15
        vpxor   ymm5,ymm4,ymm2
        vmovdqa YMMWORD[(64-128)+rax],ymm14
        vpaddd  ymm0,ymm0,ymm14
        vpxor   ymm10,ymm10,YMMWORD[((352-256-128))+rbx]
        vpsrld  ymm8,ymm1,27
        vpxor   ymm5,ymm5,ymm3
        vpxor   ymm10,ymm10,ymm12

        vpslld  ymm6,ymm2,30
        vpor    ymm7,ymm7,ymm8
        vpaddd  ymm0,ymm0,ymm5
        vpsrld  ymm9,ymm10,31
        vpaddd  ymm10,ymm10,ymm10

        vpsrld  ymm2,ymm2,2
        vpaddd  ymm0,ymm0,ymm7
        vpor    ymm10,ymm10,ymm9
        vpor    ymm2,ymm2,ymm6
        vpxor   ymm11,ymm11,ymm13
        vmovdqa ymm13,YMMWORD[((192-128))+rax]

        vpslld  ymm7,ymm0,5
        vpaddd  ymm4,ymm4,ymm15
        vpxor   ymm5,ymm3,ymm1
        vmovdqa YMMWORD[(96-128)+rax],ymm10
        vpaddd  ymm4,ymm4,ymm10
        vpxor   ymm11,ymm11,YMMWORD[((384-256-128))+rbx]
        vpsrld  ymm8,ymm0,27
        vpxor   ymm5,ymm5,ymm2
        vpxor   ymm11,ymm11,ymm13

        vpslld  ymm6,ymm1,30
        vpor    ymm7,ymm7,ymm8
        vpaddd  ymm4,ymm4,ymm5
        vpsrld  ymm9,ymm11,31
        vpaddd  ymm11,ymm11,ymm11

        vpsrld  ymm1,ymm1,2
        vpaddd  ymm4,ymm4,ymm7
        vpor    ymm11,ymm11,ymm9
        vpor    ymm1,ymm1,ymm6
        vpxor   ymm12,ymm12,ymm14
        vmovdqa ymm14,YMMWORD[((224-128))+rax]

        vpslld  ymm7,ymm4,5
        vpaddd  ymm3,ymm3,ymm15
        vpxor   ymm5,ymm2,ymm0
        vmovdqa YMMWORD[(128-128)+rax],ymm11
        vpaddd  ymm3,ymm3,ymm11
        vpxor   ymm12,ymm12,YMMWORD[((416-256-128))+rbx]
        vpsrld  ymm8,ymm4,27
        vpxor   ymm5,ymm5,ymm1
        vpxor   ymm12,ymm12,ymm14

        vpslld  ymm6,ymm0,30
        vpor    ymm7,ymm7,ymm8
        vpaddd  ymm3,ymm3,ymm5
        vpsrld  ymm9,ymm12,31
        vpaddd  ymm12,ymm12,ymm12

        vpsrld  ymm0,ymm0,2
        vpaddd  ymm3,ymm3,ymm7
        vpor    ymm12,ymm12,ymm9
        vpor    ymm0,ymm0,ymm6
        vpxor   ymm13,ymm13,ymm10
        vmovdqa ymm10,YMMWORD[((256-256-128))+rbx]

        vpslld  ymm7,ymm3,5
        vpaddd  ymm2,ymm2,ymm15
        vpxor   ymm5,ymm1,ymm4
        vmovdqa YMMWORD[(160-128)+rax],ymm12
        vpaddd  ymm2,ymm2,ymm12
        vpxor   ymm13,ymm13,YMMWORD[((448-256-128))+rbx]
        vpsrld  ymm8,ymm3,27
        vpxor   ymm5,ymm5,ymm0
        vpxor   ymm13,ymm13,ymm10

        vpslld  ymm6,ymm4,30
        vpor    ymm7,ymm7,ymm8
        vpaddd  ymm2,ymm2,ymm5
        vpsrld  ymm9,ymm13,31
        vpaddd  ymm13,ymm13,ymm13

        vpsrld  ymm4,ymm4,2
        vpaddd  ymm2,ymm2,ymm7
        vpor    ymm13,ymm13,ymm9
        vpor    ymm4,ymm4,ymm6
        vpxor   ymm14,ymm14,ymm11
        vmovdqa ymm11,YMMWORD[((288-256-128))+rbx]

        vpslld  ymm7,ymm2,5
        vpaddd  ymm1,ymm1,ymm15
        vpxor   ymm5,ymm0,ymm3
        vmovdqa YMMWORD[(192-128)+rax],ymm13
        vpaddd  ymm1,ymm1,ymm13
        vpxor   ymm14,ymm14,YMMWORD[((480-256-128))+rbx]
        vpsrld  ymm8,ymm2,27
        vpxor   ymm5,ymm5,ymm4
        vpxor   ymm14,ymm14,ymm11

        vpslld  ymm6,ymm3,30
        vpor    ymm7,ymm7,ymm8
        vpaddd  ymm1,ymm1,ymm5
        vpsrld  ymm9,ymm14,31
        vpaddd  ymm14,ymm14,ymm14

        vpsrld  ymm3,ymm3,2
        vpaddd  ymm1,ymm1,ymm7
        vpor    ymm14,ymm14,ymm9
        vpor    ymm3,ymm3,ymm6
        vpxor   ymm10,ymm10,ymm12
        vmovdqa ymm12,YMMWORD[((320-256-128))+rbx]

        vpslld  ymm7,ymm1,5
        vpaddd  ymm0,ymm0,ymm15
        vpxor   ymm5,ymm4,ymm2
        vmovdqa YMMWORD[(224-128)+rax],ymm14
        vpaddd  ymm0,ymm0,ymm14
        vpxor   ymm10,ymm10,YMMWORD[((0-128))+rax]
        vpsrld  ymm8,ymm1,27
        vpxor   ymm5,ymm5,ymm3
        vpxor   ymm10,ymm10,ymm12

        vpslld  ymm6,ymm2,30
        vpor    ymm7,ymm7,ymm8
        vpaddd  ymm0,ymm0,ymm5
        vpsrld  ymm9,ymm10,31
        vpaddd  ymm10,ymm10,ymm10

        vpsrld  ymm2,ymm2,2
        vpaddd  ymm0,ymm0,ymm7
        vpor    ymm10,ymm10,ymm9
        vpor    ymm2,ymm2,ymm6
        vmovdqa ymm15,YMMWORD[32+rbp]
        vpxor   ymm11,ymm11,ymm13
        vmovdqa ymm13,YMMWORD[((352-256-128))+rbx]

        vpaddd  ymm4,ymm4,ymm15
        vpslld  ymm7,ymm0,5
        vpand   ymm6,ymm3,ymm2
        vpxor   ymm11,ymm11,YMMWORD[((32-128))+rax]

        vpaddd  ymm4,ymm4,ymm6
        vpsrld  ymm8,ymm0,27
        vpxor   ymm5,ymm3,ymm2
        vpxor   ymm11,ymm11,ymm13

        vmovdqu YMMWORD[(256-256-128)+rbx],ymm10
        vpaddd  ymm4,ymm4,ymm10
        vpor    ymm7,ymm7,ymm8
        vpsrld  ymm9,ymm11,31
        vpand   ymm5,ymm5,ymm1
        vpaddd  ymm11,ymm11,ymm11

        vpslld  ymm6,ymm1,30
        vpaddd  ymm4,ymm4,ymm5

        vpsrld  ymm1,ymm1,2
        vpaddd  ymm4,ymm4,ymm7
        vpor    ymm11,ymm11,ymm9
        vpor    ymm1,ymm1,ymm6
        vpxor   ymm12,ymm12,ymm14
        vmovdqa ymm14,YMMWORD[((384-256-128))+rbx]

        vpaddd  ymm3,ymm3,ymm15
        vpslld  ymm7,ymm4,5
        vpand   ymm6,ymm2,ymm1
        vpxor   ymm12,ymm12,YMMWORD[((64-128))+rax]

        vpaddd  ymm3,ymm3,ymm6
        vpsrld  ymm8,ymm4,27
        vpxor   ymm5,ymm2,ymm1
        vpxor   ymm12,ymm12,ymm14

        vmovdqu YMMWORD[(288-256-128)+rbx],ymm11
        vpaddd  ymm3,ymm3,ymm11
        vpor    ymm7,ymm7,ymm8
        vpsrld  ymm9,ymm12,31
        vpand   ymm5,ymm5,ymm0
        vpaddd  ymm12,ymm12,ymm12

        vpslld  ymm6,ymm0,30
        vpaddd  ymm3,ymm3,ymm5

        vpsrld  ymm0,ymm0,2
        vpaddd  ymm3,ymm3,ymm7
        vpor    ymm12,ymm12,ymm9
        vpor    ymm0,ymm0,ymm6
        vpxor   ymm13,ymm13,ymm10
        vmovdqa ymm10,YMMWORD[((416-256-128))+rbx]

        vpaddd  ymm2,ymm2,ymm15
        vpslld  ymm7,ymm3,5
        vpand   ymm6,ymm1,ymm0
        vpxor   ymm13,ymm13,YMMWORD[((96-128))+rax]

        vpaddd  ymm2,ymm2,ymm6
        vpsrld  ymm8,ymm3,27
        vpxor   ymm5,ymm1,ymm0
        vpxor   ymm13,ymm13,ymm10

        vmovdqu YMMWORD[(320-256-128)+rbx],ymm12
        vpaddd  ymm2,ymm2,ymm12
        vpor    ymm7,ymm7,ymm8
        vpsrld  ymm9,ymm13,31
        vpand   ymm5,ymm5,ymm4
        vpaddd  ymm13,ymm13,ymm13

        vpslld  ymm6,ymm4,30
        vpaddd  ymm2,ymm2,ymm5

        vpsrld  ymm4,ymm4,2
        vpaddd  ymm2,ymm2,ymm7
        vpor    ymm13,ymm13,ymm9
        vpor    ymm4,ymm4,ymm6
        vpxor   ymm14,ymm14,ymm11
        vmovdqa ymm11,YMMWORD[((448-256-128))+rbx]

        vpaddd  ymm1,ymm1,ymm15
        vpslld  ymm7,ymm2,5
        vpand   ymm6,ymm0,ymm4
        vpxor   ymm14,ymm14,YMMWORD[((128-128))+rax]

        vpaddd  ymm1,ymm1,ymm6
        vpsrld  ymm8,ymm2,27
        vpxor   ymm5,ymm0,ymm4
        vpxor   ymm14,ymm14,ymm11

        vmovdqu YMMWORD[(352-256-128)+rbx],ymm13
        vpaddd  ymm1,ymm1,ymm13
        vpor    ymm7,ymm7,ymm8
        vpsrld  ymm9,ymm14,31
        vpand   ymm5,ymm5,ymm3
        vpaddd  ymm14,ymm14,ymm14

        vpslld  ymm6,ymm3,30
        vpaddd  ymm1,ymm1,ymm5

        vpsrld  ymm3,ymm3,2
        vpaddd  ymm1,ymm1,ymm7
        vpor    ymm14,ymm14,ymm9
        vpor    ymm3,ymm3,ymm6
        vpxor   ymm10,ymm10,ymm12
        vmovdqa ymm12,YMMWORD[((480-256-128))+rbx]

        vpaddd  ymm0,ymm0,ymm15
        vpslld  ymm7,ymm1,5
        vpand   ymm6,ymm4,ymm3
        vpxor   ymm10,ymm10,YMMWORD[((160-128))+rax]

        vpaddd  ymm0,ymm0,ymm6
        vpsrld  ymm8,ymm1,27
        vpxor   ymm5,ymm4,ymm3
        vpxor   ymm10,ymm10,ymm12

        vmovdqu YMMWORD[(384-256-128)+rbx],ymm14
        vpaddd  ymm0,ymm0,ymm14
        vpor    ymm7,ymm7,ymm8
        vpsrld  ymm9,ymm10,31
        vpand   ymm5,ymm5,ymm2
        vpaddd  ymm10,ymm10,ymm10

        vpslld  ymm6,ymm2,30
        vpaddd  ymm0,ymm0,ymm5

        vpsrld  ymm2,ymm2,2
        vpaddd  ymm0,ymm0,ymm7
        vpor    ymm10,ymm10,ymm9
        vpor    ymm2,ymm2,ymm6
        vpxor   ymm11,ymm11,ymm13
        vmovdqa ymm13,YMMWORD[((0-128))+rax]

        vpaddd  ymm4,ymm4,ymm15
        vpslld  ymm7,ymm0,5
        vpand   ymm6,ymm3,ymm2
        vpxor   ymm11,ymm11,YMMWORD[((192-128))+rax]

        vpaddd  ymm4,ymm4,ymm6
        vpsrld  ymm8,ymm0,27
        vpxor   ymm5,ymm3,ymm2
        vpxor   ymm11,ymm11,ymm13

        vmovdqu YMMWORD[(416-256-128)+rbx],ymm10
        vpaddd  ymm4,ymm4,ymm10
        vpor    ymm7,ymm7,ymm8
        vpsrld  ymm9,ymm11,31
        vpand   ymm5,ymm5,ymm1
        vpaddd  ymm11,ymm11,ymm11

        vpslld  ymm6,ymm1,30
        vpaddd  ymm4,ymm4,ymm5

        vpsrld  ymm1,ymm1,2
        vpaddd  ymm4,ymm4,ymm7
        vpor    ymm11,ymm11,ymm9
        vpor    ymm1,ymm1,ymm6
        vpxor   ymm12,ymm12,ymm14
        vmovdqa ymm14,YMMWORD[((32-128))+rax]

        vpaddd  ymm3,ymm3,ymm15
        vpslld  ymm7,ymm4,5
        vpand   ymm6,ymm2,ymm1
        vpxor   ymm12,ymm12,YMMWORD[((224-128))+rax]

        vpaddd  ymm3,ymm3,ymm6
        vpsrld  ymm8,ymm4,27
        vpxor   ymm5,ymm2,ymm1
        vpxor   ymm12,ymm12,ymm14

        vmovdqu YMMWORD[(448-256-128)+rbx],ymm11
        vpaddd  ymm3,ymm3,ymm11
        vpor    ymm7,ymm7,ymm8
        vpsrld  ymm9,ymm12,31
        vpand   ymm5,ymm5,ymm0
        vpaddd  ymm12,ymm12,ymm12

        vpslld  ymm6,ymm0,30
        vpaddd  ymm3,ymm3,ymm5

        vpsrld  ymm0,ymm0,2
        vpaddd  ymm3,ymm3,ymm7
        vpor    ymm12,ymm12,ymm9
        vpor    ymm0,ymm0,ymm6
        vpxor   ymm13,ymm13,ymm10
        vmovdqa ymm10,YMMWORD[((64-128))+rax]

        vpaddd  ymm2,ymm2,ymm15
        vpslld  ymm7,ymm3,5
        vpand   ymm6,ymm1,ymm0
        vpxor   ymm13,ymm13,YMMWORD[((256-256-128))+rbx]

        vpaddd  ymm2,ymm2,ymm6
        vpsrld  ymm8,ymm3,27
        vpxor   ymm5,ymm1,ymm0
        vpxor   ymm13,ymm13,ymm10

        vmovdqu YMMWORD[(480-256-128)+rbx],ymm12
        vpaddd  ymm2,ymm2,ymm12
        vpor    ymm7,ymm7,ymm8
        vpsrld  ymm9,ymm13,31
        vpand   ymm5,ymm5,ymm4
        vpaddd  ymm13,ymm13,ymm13

        vpslld  ymm6,ymm4,30
        vpaddd  ymm2,ymm2,ymm5

        vpsrld  ymm4,ymm4,2
        vpaddd  ymm2,ymm2,ymm7
        vpor    ymm13,ymm13,ymm9
        vpor    ymm4,ymm4,ymm6
        vpxor   ymm14,ymm14,ymm11
        vmovdqa ymm11,YMMWORD[((96-128))+rax]

        vpaddd  ymm1,ymm1,ymm15
        vpslld  ymm7,ymm2,5
        vpand   ymm6,ymm0,ymm4
        vpxor   ymm14,ymm14,YMMWORD[((288-256-128))+rbx]

        vpaddd  ymm1,ymm1,ymm6
        vpsrld  ymm8,ymm2,27
        vpxor   ymm5,ymm0,ymm4
        vpxor   ymm14,ymm14,ymm11

        vmovdqu YMMWORD[(0-128)+rax],ymm13
        vpaddd  ymm1,ymm1,ymm13
        vpor    ymm7,ymm7,ymm8
        vpsrld  ymm9,ymm14,31
        vpand   ymm5,ymm5,ymm3
        vpaddd  ymm14,ymm14,ymm14

        vpslld  ymm6,ymm3,30
        vpaddd  ymm1,ymm1,ymm5

        vpsrld  ymm3,ymm3,2
        vpaddd  ymm1,ymm1,ymm7
        vpor    ymm14,ymm14,ymm9
        vpor    ymm3,ymm3,ymm6
        vpxor   ymm10,ymm10,ymm12
        vmovdqa ymm12,YMMWORD[((128-128))+rax]

        vpaddd  ymm0,ymm0,ymm15
        vpslld  ymm7,ymm1,5
        vpand   ymm6,ymm4,ymm3
        vpxor   ymm10,ymm10,YMMWORD[((320-256-128))+rbx]

        vpaddd  ymm0,ymm0,ymm6
        vpsrld  ymm8,ymm1,27
        vpxor   ymm5,ymm4,ymm3
        vpxor   ymm10,ymm10,ymm12

        vmovdqu YMMWORD[(32-128)+rax],ymm14
        vpaddd  ymm0,ymm0,ymm14
        vpor    ymm7,ymm7,ymm8
        vpsrld  ymm9,ymm10,31
        vpand   ymm5,ymm5,ymm2
        vpaddd  ymm10,ymm10,ymm10

        vpslld  ymm6,ymm2,30
        vpaddd  ymm0,ymm0,ymm5

        vpsrld  ymm2,ymm2,2
        vpaddd  ymm0,ymm0,ymm7
        vpor    ymm10,ymm10,ymm9
        vpor    ymm2,ymm2,ymm6
        vpxor   ymm11,ymm11,ymm13
        vmovdqa ymm13,YMMWORD[((160-128))+rax]

        vpaddd  ymm4,ymm4,ymm15
        vpslld  ymm7,ymm0,5
        vpand   ymm6,ymm3,ymm2
        vpxor   ymm11,ymm11,YMMWORD[((352-256-128))+rbx]

        vpaddd  ymm4,ymm4,ymm6
        vpsrld  ymm8,ymm0,27
        vpxor   ymm5,ymm3,ymm2
        vpxor   ymm11,ymm11,ymm13

        vmovdqu YMMWORD[(64-128)+rax],ymm10
        vpaddd  ymm4,ymm4,ymm10
        vpor    ymm7,ymm7,ymm8
        vpsrld  ymm9,ymm11,31
        vpand   ymm5,ymm5,ymm1
        vpaddd  ymm11,ymm11,ymm11

        vpslld  ymm6,ymm1,30
        vpaddd  ymm4,ymm4,ymm5

        vpsrld  ymm1,ymm1,2
        vpaddd  ymm4,ymm4,ymm7
        vpor    ymm11,ymm11,ymm9
        vpor    ymm1,ymm1,ymm6
        vpxor   ymm12,ymm12,ymm14
        vmovdqa ymm14,YMMWORD[((192-128))+rax]

        vpaddd  ymm3,ymm3,ymm15
        vpslld  ymm7,ymm4,5
        vpand   ymm6,ymm2,ymm1
        vpxor   ymm12,ymm12,YMMWORD[((384-256-128))+rbx]

        vpaddd  ymm3,ymm3,ymm6
        vpsrld  ymm8,ymm4,27
        vpxor   ymm5,ymm2,ymm1
        vpxor   ymm12,ymm12,ymm14

        vmovdqu YMMWORD[(96-128)+rax],ymm11
        vpaddd  ymm3,ymm3,ymm11
        vpor    ymm7,ymm7,ymm8
        vpsrld  ymm9,ymm12,31
        vpand   ymm5,ymm5,ymm0
        vpaddd  ymm12,ymm12,ymm12

        vpslld  ymm6,ymm0,30
        vpaddd  ymm3,ymm3,ymm5

        vpsrld  ymm0,ymm0,2
        vpaddd  ymm3,ymm3,ymm7
        vpor    ymm12,ymm12,ymm9
        vpor    ymm0,ymm0,ymm6
        vpxor   ymm13,ymm13,ymm10
        vmovdqa ymm10,YMMWORD[((224-128))+rax]

        vpaddd  ymm2,ymm2,ymm15
        vpslld  ymm7,ymm3,5
        vpand   ymm6,ymm1,ymm0
        vpxor   ymm13,ymm13,YMMWORD[((416-256-128))+rbx]

        vpaddd  ymm2,ymm2,ymm6
        vpsrld  ymm8,ymm3,27
        vpxor   ymm5,ymm1,ymm0
        vpxor   ymm13,ymm13,ymm10

        vmovdqu YMMWORD[(128-128)+rax],ymm12
        vpaddd  ymm2,ymm2,ymm12
        vpor    ymm7,ymm7,ymm8
        vpsrld  ymm9,ymm13,31
        vpand   ymm5,ymm5,ymm4
        vpaddd  ymm13,ymm13,ymm13

        vpslld  ymm6,ymm4,30
        vpaddd  ymm2,ymm2,ymm5

        vpsrld  ymm4,ymm4,2
        vpaddd  ymm2,ymm2,ymm7
        vpor    ymm13,ymm13,ymm9
        vpor    ymm4,ymm4,ymm6
        vpxor   ymm14,ymm14,ymm11
        vmovdqa ymm11,YMMWORD[((256-256-128))+rbx]

        vpaddd  ymm1,ymm1,ymm15
        vpslld  ymm7,ymm2,5
        vpand   ymm6,ymm0,ymm4
        vpxor   ymm14,ymm14,YMMWORD[((448-256-128))+rbx]

        vpaddd  ymm1,ymm1,ymm6
        vpsrld  ymm8,ymm2,27
        vpxor   ymm5,ymm0,ymm4
        vpxor   ymm14,ymm14,ymm11

        vmovdqu YMMWORD[(160-128)+rax],ymm13
        vpaddd  ymm1,ymm1,ymm13
        vpor    ymm7,ymm7,ymm8
        vpsrld  ymm9,ymm14,31
        vpand   ymm5,ymm5,ymm3
        vpaddd  ymm14,ymm14,ymm14

        vpslld  ymm6,ymm3,30
        vpaddd  ymm1,ymm1,ymm5

        vpsrld  ymm3,ymm3,2
        vpaddd  ymm1,ymm1,ymm7
        vpor    ymm14,ymm14,ymm9
        vpor    ymm3,ymm3,ymm6
        vpxor   ymm10,ymm10,ymm12
        vmovdqa ymm12,YMMWORD[((288-256-128))+rbx]

        vpaddd  ymm0,ymm0,ymm15
        vpslld  ymm7,ymm1,5
        vpand   ymm6,ymm4,ymm3
        vpxor   ymm10,ymm10,YMMWORD[((480-256-128))+rbx]

        vpaddd  ymm0,ymm0,ymm6
        vpsrld  ymm8,ymm1,27
        vpxor   ymm5,ymm4,ymm3
        vpxor   ymm10,ymm10,ymm12

        vmovdqu YMMWORD[(192-128)+rax],ymm14
        vpaddd  ymm0,ymm0,ymm14
        vpor    ymm7,ymm7,ymm8
        vpsrld  ymm9,ymm10,31
        vpand   ymm5,ymm5,ymm2
        vpaddd  ymm10,ymm10,ymm10

        vpslld  ymm6,ymm2,30
        vpaddd  ymm0,ymm0,ymm5

        vpsrld  ymm2,ymm2,2
        vpaddd  ymm0,ymm0,ymm7
        vpor    ymm10,ymm10,ymm9
        vpor    ymm2,ymm2,ymm6
        vpxor   ymm11,ymm11,ymm13
        vmovdqa ymm13,YMMWORD[((320-256-128))+rbx]

        vpaddd  ymm4,ymm4,ymm15
        vpslld  ymm7,ymm0,5
        vpand   ymm6,ymm3,ymm2
        vpxor   ymm11,ymm11,YMMWORD[((0-128))+rax]

        vpaddd  ymm4,ymm4,ymm6
        vpsrld  ymm8,ymm0,27
        vpxor   ymm5,ymm3,ymm2
        vpxor   ymm11,ymm11,ymm13

        vmovdqu YMMWORD[(224-128)+rax],ymm10
        vpaddd  ymm4,ymm4,ymm10
        vpor    ymm7,ymm7,ymm8
        vpsrld  ymm9,ymm11,31
        vpand   ymm5,ymm5,ymm1
        vpaddd  ymm11,ymm11,ymm11

        vpslld  ymm6,ymm1,30
        vpaddd  ymm4,ymm4,ymm5

        vpsrld  ymm1,ymm1,2
        vpaddd  ymm4,ymm4,ymm7
        vpor    ymm11,ymm11,ymm9
        vpor    ymm1,ymm1,ymm6
        vpxor   ymm12,ymm12,ymm14
        vmovdqa ymm14,YMMWORD[((352-256-128))+rbx]

        vpaddd  ymm3,ymm3,ymm15
        vpslld  ymm7,ymm4,5
        vpand   ymm6,ymm2,ymm1
        vpxor   ymm12,ymm12,YMMWORD[((32-128))+rax]

        vpaddd  ymm3,ymm3,ymm6
        vpsrld  ymm8,ymm4,27
        vpxor   ymm5,ymm2,ymm1
        vpxor   ymm12,ymm12,ymm14

        vmovdqu YMMWORD[(256-256-128)+rbx],ymm11
        vpaddd  ymm3,ymm3,ymm11
        vpor    ymm7,ymm7,ymm8
        vpsrld  ymm9,ymm12,31
        vpand   ymm5,ymm5,ymm0
        vpaddd  ymm12,ymm12,ymm12

        vpslld  ymm6,ymm0,30
        vpaddd  ymm3,ymm3,ymm5

        vpsrld  ymm0,ymm0,2
        vpaddd  ymm3,ymm3,ymm7
        vpor    ymm12,ymm12,ymm9
        vpor    ymm0,ymm0,ymm6
        vpxor   ymm13,ymm13,ymm10
        vmovdqa ymm10,YMMWORD[((384-256-128))+rbx]

        vpaddd  ymm2,ymm2,ymm15
        vpslld  ymm7,ymm3,5
        vpand   ymm6,ymm1,ymm0
        vpxor   ymm13,ymm13,YMMWORD[((64-128))+rax]

        vpaddd  ymm2,ymm2,ymm6
        vpsrld  ymm8,ymm3,27
        vpxor   ymm5,ymm1,ymm0
        vpxor   ymm13,ymm13,ymm10

        vmovdqu YMMWORD[(288-256-128)+rbx],ymm12
        vpaddd  ymm2,ymm2,ymm12
        vpor    ymm7,ymm7,ymm8
        vpsrld  ymm9,ymm13,31
        vpand   ymm5,ymm5,ymm4
        vpaddd  ymm13,ymm13,ymm13

        vpslld  ymm6,ymm4,30
        vpaddd  ymm2,ymm2,ymm5

        vpsrld  ymm4,ymm4,2
        vpaddd  ymm2,ymm2,ymm7
        vpor    ymm13,ymm13,ymm9
        vpor    ymm4,ymm4,ymm6
        vpxor   ymm14,ymm14,ymm11
        vmovdqa ymm11,YMMWORD[((416-256-128))+rbx]

        vpaddd  ymm1,ymm1,ymm15
        vpslld  ymm7,ymm2,5
        vpand   ymm6,ymm0,ymm4
        vpxor   ymm14,ymm14,YMMWORD[((96-128))+rax]

        vpaddd  ymm1,ymm1,ymm6
        vpsrld  ymm8,ymm2,27
        vpxor   ymm5,ymm0,ymm4
        vpxor   ymm14,ymm14,ymm11

        vmovdqu YMMWORD[(320-256-128)+rbx],ymm13
        vpaddd  ymm1,ymm1,ymm13
        vpor    ymm7,ymm7,ymm8
        vpsrld  ymm9,ymm14,31
        vpand   ymm5,ymm5,ymm3
        vpaddd  ymm14,ymm14,ymm14

        vpslld  ymm6,ymm3,30
        vpaddd  ymm1,ymm1,ymm5

        vpsrld  ymm3,ymm3,2
        vpaddd  ymm1,ymm1,ymm7
        vpor    ymm14,ymm14,ymm9
        vpor    ymm3,ymm3,ymm6
        vpxor   ymm10,ymm10,ymm12
        vmovdqa ymm12,YMMWORD[((448-256-128))+rbx]

        vpaddd  ymm0,ymm0,ymm15
        vpslld  ymm7,ymm1,5
        vpand   ymm6,ymm4,ymm3
        vpxor   ymm10,ymm10,YMMWORD[((128-128))+rax]

        vpaddd  ymm0,ymm0,ymm6
        vpsrld  ymm8,ymm1,27
        vpxor   ymm5,ymm4,ymm3
        vpxor   ymm10,ymm10,ymm12

        vmovdqu YMMWORD[(352-256-128)+rbx],ymm14
        vpaddd  ymm0,ymm0,ymm14
        vpor    ymm7,ymm7,ymm8
        vpsrld  ymm9,ymm10,31
        vpand   ymm5,ymm5,ymm2
        vpaddd  ymm10,ymm10,ymm10

        vpslld  ymm6,ymm2,30
        vpaddd  ymm0,ymm0,ymm5

        vpsrld  ymm2,ymm2,2
        vpaddd  ymm0,ymm0,ymm7
        vpor    ymm10,ymm10,ymm9
        vpor    ymm2,ymm2,ymm6
        vmovdqa ymm15,YMMWORD[64+rbp]
        vpxor   ymm11,ymm11,ymm13
        vmovdqa ymm13,YMMWORD[((480-256-128))+rbx]

        vpslld  ymm7,ymm0,5
        vpaddd  ymm4,ymm4,ymm15
        vpxor   ymm5,ymm3,ymm1
        vmovdqa YMMWORD[(384-256-128)+rbx],ymm10
        vpaddd  ymm4,ymm4,ymm10
        vpxor   ymm11,ymm11,YMMWORD[((160-128))+rax]
        vpsrld  ymm8,ymm0,27
        vpxor   ymm5,ymm5,ymm2
        vpxor   ymm11,ymm11,ymm13

        vpslld  ymm6,ymm1,30
        vpor    ymm7,ymm7,ymm8
        vpaddd  ymm4,ymm4,ymm5
        vpsrld  ymm9,ymm11,31
        vpaddd  ymm11,ymm11,ymm11

        vpsrld  ymm1,ymm1,2
        vpaddd  ymm4,ymm4,ymm7
        vpor    ymm11,ymm11,ymm9
        vpor    ymm1,ymm1,ymm6
        vpxor   ymm12,ymm12,ymm14
        vmovdqa ymm14,YMMWORD[((0-128))+rax]

        vpslld  ymm7,ymm4,5
        vpaddd  ymm3,ymm3,ymm15
        vpxor   ymm5,ymm2,ymm0
        vmovdqa YMMWORD[(416-256-128)+rbx],ymm11
        vpaddd  ymm3,ymm3,ymm11
        vpxor   ymm12,ymm12,YMMWORD[((192-128))+rax]
        vpsrld  ymm8,ymm4,27
        vpxor   ymm5,ymm5,ymm1
        vpxor   ymm12,ymm12,ymm14

        vpslld  ymm6,ymm0,30
        vpor    ymm7,ymm7,ymm8
        vpaddd  ymm3,ymm3,ymm5
        vpsrld  ymm9,ymm12,31
        vpaddd  ymm12,ymm12,ymm12

        vpsrld  ymm0,ymm0,2
        vpaddd  ymm3,ymm3,ymm7
        vpor    ymm12,ymm12,ymm9
        vpor    ymm0,ymm0,ymm6
        vpxor   ymm13,ymm13,ymm10
        vmovdqa ymm10,YMMWORD[((32-128))+rax]

        vpslld  ymm7,ymm3,5
        vpaddd  ymm2,ymm2,ymm15
        vpxor   ymm5,ymm1,ymm4
        vmovdqa YMMWORD[(448-256-128)+rbx],ymm12
        vpaddd  ymm2,ymm2,ymm12
        vpxor   ymm13,ymm13,YMMWORD[((224-128))+rax]
        vpsrld  ymm8,ymm3,27
        vpxor   ymm5,ymm5,ymm0
        vpxor   ymm13,ymm13,ymm10

        vpslld  ymm6,ymm4,30
        vpor    ymm7,ymm7,ymm8
        vpaddd  ymm2,ymm2,ymm5
        vpsrld  ymm9,ymm13,31
        vpaddd  ymm13,ymm13,ymm13

        vpsrld  ymm4,ymm4,2
        vpaddd  ymm2,ymm2,ymm7
        vpor    ymm13,ymm13,ymm9
        vpor    ymm4,ymm4,ymm6
        vpxor   ymm14,ymm14,ymm11
        vmovdqa ymm11,YMMWORD[((64-128))+rax]

        vpslld  ymm7,ymm2,5
        vpaddd  ymm1,ymm1,ymm15
        vpxor   ymm5,ymm0,ymm3
        vmovdqa YMMWORD[(480-256-128)+rbx],ymm13
        vpaddd  ymm1,ymm1,ymm13
        vpxor   ymm14,ymm14,YMMWORD[((256-256-128))+rbx]
        vpsrld  ymm8,ymm2,27
        vpxor   ymm5,ymm5,ymm4
        vpxor   ymm14,ymm14,ymm11

        vpslld  ymm6,ymm3,30
        vpor    ymm7,ymm7,ymm8
        vpaddd  ymm1,ymm1,ymm5
        vpsrld  ymm9,ymm14,31
        vpaddd  ymm14,ymm14,ymm14

        vpsrld  ymm3,ymm3,2
        vpaddd  ymm1,ymm1,ymm7
        vpor    ymm14,ymm14,ymm9
        vpor    ymm3,ymm3,ymm6
        vpxor   ymm10,ymm10,ymm12
        vmovdqa ymm12,YMMWORD[((96-128))+rax]

        vpslld  ymm7,ymm1,5
        vpaddd  ymm0,ymm0,ymm15
        vpxor   ymm5,ymm4,ymm2
        vmovdqa YMMWORD[(0-128)+rax],ymm14
        vpaddd  ymm0,ymm0,ymm14
        vpxor   ymm10,ymm10,YMMWORD[((288-256-128))+rbx]
        vpsrld  ymm8,ymm1,27
        vpxor   ymm5,ymm5,ymm3
        vpxor   ymm10,ymm10,ymm12

        vpslld  ymm6,ymm2,30
        vpor    ymm7,ymm7,ymm8
        vpaddd  ymm0,ymm0,ymm5
        vpsrld  ymm9,ymm10,31
        vpaddd  ymm10,ymm10,ymm10

        vpsrld  ymm2,ymm2,2
        vpaddd  ymm0,ymm0,ymm7
        vpor    ymm10,ymm10,ymm9
        vpor    ymm2,ymm2,ymm6
        vpxor   ymm11,ymm11,ymm13
        vmovdqa ymm13,YMMWORD[((128-128))+rax]

        vpslld  ymm7,ymm0,5
        vpaddd  ymm4,ymm4,ymm15
        vpxor   ymm5,ymm3,ymm1
        vmovdqa YMMWORD[(32-128)+rax],ymm10
        vpaddd  ymm4,ymm4,ymm10
        vpxor   ymm11,ymm11,YMMWORD[((320-256-128))+rbx]
        vpsrld  ymm8,ymm0,27
        vpxor   ymm5,ymm5,ymm2
        vpxor   ymm11,ymm11,ymm13

        vpslld  ymm6,ymm1,30
        vpor    ymm7,ymm7,ymm8
        vpaddd  ymm4,ymm4,ymm5
        vpsrld  ymm9,ymm11,31
        vpaddd  ymm11,ymm11,ymm11

        vpsrld  ymm1,ymm1,2
        vpaddd  ymm4,ymm4,ymm7
        vpor    ymm11,ymm11,ymm9
        vpor    ymm1,ymm1,ymm6
        vpxor   ymm12,ymm12,ymm14
        vmovdqa ymm14,YMMWORD[((160-128))+rax]

        vpslld  ymm7,ymm4,5
        vpaddd  ymm3,ymm3,ymm15
        vpxor   ymm5,ymm2,ymm0
        vmovdqa YMMWORD[(64-128)+rax],ymm11
        vpaddd  ymm3,ymm3,ymm11
        vpxor   ymm12,ymm12,YMMWORD[((352-256-128))+rbx]
        vpsrld  ymm8,ymm4,27
        vpxor   ymm5,ymm5,ymm1
        vpxor   ymm12,ymm12,ymm14

        vpslld  ymm6,ymm0,30
        vpor    ymm7,ymm7,ymm8
        vpaddd  ymm3,ymm3,ymm5
        vpsrld  ymm9,ymm12,31
        vpaddd  ymm12,ymm12,ymm12

        vpsrld  ymm0,ymm0,2
        vpaddd  ymm3,ymm3,ymm7
        vpor    ymm12,ymm12,ymm9
        vpor    ymm0,ymm0,ymm6
        vpxor   ymm13,ymm13,ymm10
        vmovdqa ymm10,YMMWORD[((192-128))+rax]

        vpslld  ymm7,ymm3,5
        vpaddd  ymm2,ymm2,ymm15
        vpxor   ymm5,ymm1,ymm4
        vmovdqa YMMWORD[(96-128)+rax],ymm12
        vpaddd  ymm2,ymm2,ymm12
        vpxor   ymm13,ymm13,YMMWORD[((384-256-128))+rbx]
        vpsrld  ymm8,ymm3,27
        vpxor   ymm5,ymm5,ymm0
        vpxor   ymm13,ymm13,ymm10

        vpslld  ymm6,ymm4,30
        vpor    ymm7,ymm7,ymm8
        vpaddd  ymm2,ymm2,ymm5
        vpsrld  ymm9,ymm13,31
        vpaddd  ymm13,ymm13,ymm13

        vpsrld  ymm4,ymm4,2
        vpaddd  ymm2,ymm2,ymm7
        vpor    ymm13,ymm13,ymm9
        vpor    ymm4,ymm4,ymm6
        vpxor   ymm14,ymm14,ymm11
        vmovdqa ymm11,YMMWORD[((224-128))+rax]

        vpslld  ymm7,ymm2,5
        vpaddd  ymm1,ymm1,ymm15
        vpxor   ymm5,ymm0,ymm3
        vmovdqa YMMWORD[(128-128)+rax],ymm13
        vpaddd  ymm1,ymm1,ymm13
        vpxor   ymm14,ymm14,YMMWORD[((416-256-128))+rbx]
        vpsrld  ymm8,ymm2,27
        vpxor   ymm5,ymm5,ymm4
        vpxor   ymm14,ymm14,ymm11

        vpslld  ymm6,ymm3,30
        vpor    ymm7,ymm7,ymm8
        vpaddd  ymm1,ymm1,ymm5
        vpsrld  ymm9,ymm14,31
        vpaddd  ymm14,ymm14,ymm14

        vpsrld  ymm3,ymm3,2
        vpaddd  ymm1,ymm1,ymm7
        vpor    ymm14,ymm14,ymm9
        vpor    ymm3,ymm3,ymm6
        vpxor   ymm10,ymm10,ymm12
        vmovdqa ymm12,YMMWORD[((256-256-128))+rbx]

        vpslld  ymm7,ymm1,5
        vpaddd  ymm0,ymm0,ymm15
        vpxor   ymm5,ymm4,ymm2
        vmovdqa YMMWORD[(160-128)+rax],ymm14
        vpaddd  ymm0,ymm0,ymm14
        vpxor   ymm10,ymm10,YMMWORD[((448-256-128))+rbx]
        vpsrld  ymm8,ymm1,27
        vpxor   ymm5,ymm5,ymm3
        vpxor   ymm10,ymm10,ymm12

        vpslld  ymm6,ymm2,30
        vpor    ymm7,ymm7,ymm8
        vpaddd  ymm0,ymm0,ymm5
        vpsrld  ymm9,ymm10,31
        vpaddd  ymm10,ymm10,ymm10

        vpsrld  ymm2,ymm2,2
        vpaddd  ymm0,ymm0,ymm7
        vpor    ymm10,ymm10,ymm9
        vpor    ymm2,ymm2,ymm6
        vpxor   ymm11,ymm11,ymm13
        vmovdqa ymm13,YMMWORD[((288-256-128))+rbx]

        vpslld  ymm7,ymm0,5
        vpaddd  ymm4,ymm4,ymm15
        vpxor   ymm5,ymm3,ymm1
        vmovdqa YMMWORD[(192-128)+rax],ymm10
        vpaddd  ymm4,ymm4,ymm10
        vpxor   ymm11,ymm11,YMMWORD[((480-256-128))+rbx]
        vpsrld  ymm8,ymm0,27
        vpxor   ymm5,ymm5,ymm2
        vpxor   ymm11,ymm11,ymm13

        vpslld  ymm6,ymm1,30
        vpor    ymm7,ymm7,ymm8
        vpaddd  ymm4,ymm4,ymm5
        vpsrld  ymm9,ymm11,31
        vpaddd  ymm11,ymm11,ymm11

        vpsrld  ymm1,ymm1,2
        vpaddd  ymm4,ymm4,ymm7
        vpor    ymm11,ymm11,ymm9
        vpor    ymm1,ymm1,ymm6
        vpxor   ymm12,ymm12,ymm14
        vmovdqa ymm14,YMMWORD[((320-256-128))+rbx]

        vpslld  ymm7,ymm4,5
        vpaddd  ymm3,ymm3,ymm15
        vpxor   ymm5,ymm2,ymm0
        vmovdqa YMMWORD[(224-128)+rax],ymm11
        vpaddd  ymm3,ymm3,ymm11
        vpxor   ymm12,ymm12,YMMWORD[((0-128))+rax]
        vpsrld  ymm8,ymm4,27
        vpxor   ymm5,ymm5,ymm1
        vpxor   ymm12,ymm12,ymm14

        vpslld  ymm6,ymm0,30
        vpor    ymm7,ymm7,ymm8
        vpaddd  ymm3,ymm3,ymm5
        vpsrld  ymm9,ymm12,31
        vpaddd  ymm12,ymm12,ymm12

        vpsrld  ymm0,ymm0,2
        vpaddd  ymm3,ymm3,ymm7
        vpor    ymm12,ymm12,ymm9
        vpor    ymm0,ymm0,ymm6
        vpxor   ymm13,ymm13,ymm10
        vmovdqa ymm10,YMMWORD[((352-256-128))+rbx]

        vpslld  ymm7,ymm3,5
        vpaddd  ymm2,ymm2,ymm15
        vpxor   ymm5,ymm1,ymm4
        vpaddd  ymm2,ymm2,ymm12
        vpxor   ymm13,ymm13,YMMWORD[((32-128))+rax]
        vpsrld  ymm8,ymm3,27
        vpxor   ymm5,ymm5,ymm0
        vpxor   ymm13,ymm13,ymm10

        vpslld  ymm6,ymm4,30
        vpor    ymm7,ymm7,ymm8
        vpaddd  ymm2,ymm2,ymm5
        vpsrld  ymm9,ymm13,31
        vpaddd  ymm13,ymm13,ymm13

        vpsrld  ymm4,ymm4,2
        vpaddd  ymm2,ymm2,ymm7
        vpor    ymm13,ymm13,ymm9
        vpor    ymm4,ymm4,ymm6
        vpxor   ymm14,ymm14,ymm11
        vmovdqa ymm11,YMMWORD[((384-256-128))+rbx]

        vpslld  ymm7,ymm2,5
        vpaddd  ymm1,ymm1,ymm15
        vpxor   ymm5,ymm0,ymm3
        vpaddd  ymm1,ymm1,ymm13
        vpxor   ymm14,ymm14,YMMWORD[((64-128))+rax]
        vpsrld  ymm8,ymm2,27
        vpxor   ymm5,ymm5,ymm4
        vpxor   ymm14,ymm14,ymm11

        vpslld  ymm6,ymm3,30
        vpor    ymm7,ymm7,ymm8
        vpaddd  ymm1,ymm1,ymm5
        vpsrld  ymm9,ymm14,31
        vpaddd  ymm14,ymm14,ymm14

        vpsrld  ymm3,ymm3,2
        vpaddd  ymm1,ymm1,ymm7
        vpor    ymm14,ymm14,ymm9
        vpor    ymm3,ymm3,ymm6
        vpxor   ymm10,ymm10,ymm12
        vmovdqa ymm12,YMMWORD[((416-256-128))+rbx]

        vpslld  ymm7,ymm1,5
        vpaddd  ymm0,ymm0,ymm15
        vpxor   ymm5,ymm4,ymm2
        vpaddd  ymm0,ymm0,ymm14
        vpxor   ymm10,ymm10,YMMWORD[((96-128))+rax]
        vpsrld  ymm8,ymm1,27
        vpxor   ymm5,ymm5,ymm3
        vpxor   ymm10,ymm10,ymm12

        vpslld  ymm6,ymm2,30
        vpor    ymm7,ymm7,ymm8
        vpaddd  ymm0,ymm0,ymm5
        vpsrld  ymm9,ymm10,31
        vpaddd  ymm10,ymm10,ymm10

        vpsrld  ymm2,ymm2,2
        vpaddd  ymm0,ymm0,ymm7
        vpor    ymm10,ymm10,ymm9
        vpor    ymm2,ymm2,ymm6
        vpxor   ymm11,ymm11,ymm13
        vmovdqa ymm13,YMMWORD[((448-256-128))+rbx]

        vpslld  ymm7,ymm0,5
        vpaddd  ymm4,ymm4,ymm15
        vpxor   ymm5,ymm3,ymm1
        vpaddd  ymm4,ymm4,ymm10
        vpxor   ymm11,ymm11,YMMWORD[((128-128))+rax]
        vpsrld  ymm8,ymm0,27
        vpxor   ymm5,ymm5,ymm2
        vpxor   ymm11,ymm11,ymm13

        vpslld  ymm6,ymm1,30
        vpor    ymm7,ymm7,ymm8
        vpaddd  ymm4,ymm4,ymm5
        vpsrld  ymm9,ymm11,31
        vpaddd  ymm11,ymm11,ymm11

        vpsrld  ymm1,ymm1,2
        vpaddd  ymm4,ymm4,ymm7
        vpor    ymm11,ymm11,ymm9
        vpor    ymm1,ymm1,ymm6
        vpxor   ymm12,ymm12,ymm14
        vmovdqa ymm14,YMMWORD[((480-256-128))+rbx]

        vpslld  ymm7,ymm4,5
        vpaddd  ymm3,ymm3,ymm15
        vpxor   ymm5,ymm2,ymm0
        vpaddd  ymm3,ymm3,ymm11
        vpxor   ymm12,ymm12,YMMWORD[((160-128))+rax]
        vpsrld  ymm8,ymm4,27
        vpxor   ymm5,ymm5,ymm1
        vpxor   ymm12,ymm12,ymm14

        vpslld  ymm6,ymm0,30
        vpor    ymm7,ymm7,ymm8
        vpaddd  ymm3,ymm3,ymm5
        vpsrld  ymm9,ymm12,31
        vpaddd  ymm12,ymm12,ymm12

        vpsrld  ymm0,ymm0,2
        vpaddd  ymm3,ymm3,ymm7
        vpor    ymm12,ymm12,ymm9
        vpor    ymm0,ymm0,ymm6
        vpxor   ymm13,ymm13,ymm10
        vmovdqa ymm10,YMMWORD[((0-128))+rax]

        vpslld  ymm7,ymm3,5
        vpaddd  ymm2,ymm2,ymm15
        vpxor   ymm5,ymm1,ymm4
        vpaddd  ymm2,ymm2,ymm12
        vpxor   ymm13,ymm13,YMMWORD[((192-128))+rax]
        vpsrld  ymm8,ymm3,27
        vpxor   ymm5,ymm5,ymm0
        vpxor   ymm13,ymm13,ymm10

        vpslld  ymm6,ymm4,30
        vpor    ymm7,ymm7,ymm8
        vpaddd  ymm2,ymm2,ymm5
        vpsrld  ymm9,ymm13,31
        vpaddd  ymm13,ymm13,ymm13

        vpsrld  ymm4,ymm4,2
        vpaddd  ymm2,ymm2,ymm7
        vpor    ymm13,ymm13,ymm9
        vpor    ymm4,ymm4,ymm6
        vpxor   ymm14,ymm14,ymm11
        vmovdqa ymm11,YMMWORD[((32-128))+rax]

        vpslld  ymm7,ymm2,5
        vpaddd  ymm1,ymm1,ymm15
        vpxor   ymm5,ymm0,ymm3
        vpaddd  ymm1,ymm1,ymm13
        vpxor   ymm14,ymm14,YMMWORD[((224-128))+rax]
        vpsrld  ymm8,ymm2,27
        vpxor   ymm5,ymm5,ymm4
        vpxor   ymm14,ymm14,ymm11

        vpslld  ymm6,ymm3,30
        vpor    ymm7,ymm7,ymm8
        vpaddd  ymm1,ymm1,ymm5
        vpsrld  ymm9,ymm14,31
        vpaddd  ymm14,ymm14,ymm14

        vpsrld  ymm3,ymm3,2
        vpaddd  ymm1,ymm1,ymm7
        vpor    ymm14,ymm14,ymm9
        vpor    ymm3,ymm3,ymm6
        vpslld  ymm7,ymm1,5
        vpaddd  ymm0,ymm0,ymm15
        vpxor   ymm5,ymm4,ymm2

        vpsrld  ymm8,ymm1,27
        vpaddd  ymm0,ymm0,ymm14
        vpxor   ymm5,ymm5,ymm3

        vpslld  ymm6,ymm2,30
        vpor    ymm7,ymm7,ymm8
        vpaddd  ymm0,ymm0,ymm5

        vpsrld  ymm2,ymm2,2
        vpaddd  ymm0,ymm0,ymm7
        vpor    ymm2,ymm2,ymm6
        mov     ecx,1
        lea     rbx,[512+rsp]
        cmp     ecx,DWORD[rbx]
        cmovge  r12,rbp
        cmp     ecx,DWORD[4+rbx]
        cmovge  r13,rbp
        cmp     ecx,DWORD[8+rbx]
        cmovge  r14,rbp
        cmp     ecx,DWORD[12+rbx]
        cmovge  r15,rbp
        cmp     ecx,DWORD[16+rbx]
        cmovge  r8,rbp
        cmp     ecx,DWORD[20+rbx]
        cmovge  r9,rbp
        cmp     ecx,DWORD[24+rbx]
        cmovge  r10,rbp
        cmp     ecx,DWORD[28+rbx]
        cmovge  r11,rbp
        vmovdqu ymm5,YMMWORD[rbx]
        vpxor   ymm7,ymm7,ymm7
        vmovdqa ymm6,ymm5
        vpcmpgtd        ymm6,ymm6,ymm7
        vpaddd  ymm5,ymm5,ymm6

        vpand   ymm0,ymm0,ymm6
        vpand   ymm1,ymm1,ymm6
        vpaddd  ymm0,ymm0,YMMWORD[rdi]
        vpand   ymm2,ymm2,ymm6
        vpaddd  ymm1,ymm1,YMMWORD[32+rdi]
        vpand   ymm3,ymm3,ymm6
        vpaddd  ymm2,ymm2,YMMWORD[64+rdi]
        vpand   ymm4,ymm4,ymm6
        vpaddd  ymm3,ymm3,YMMWORD[96+rdi]
        vpaddd  ymm4,ymm4,YMMWORD[128+rdi]
        vmovdqu YMMWORD[rdi],ymm0
        vmovdqu YMMWORD[32+rdi],ymm1
        vmovdqu YMMWORD[64+rdi],ymm2
        vmovdqu YMMWORD[96+rdi],ymm3
        vmovdqu YMMWORD[128+rdi],ymm4

        vmovdqu YMMWORD[rbx],ymm5
        lea     rbx,[((256+128))+rsp]
        vmovdqu ymm9,YMMWORD[96+rbp]
        dec     edx
        jnz     NEAR $L$oop_avx2







$L$done_avx2:
        mov     rax,QWORD[544+rsp]

        vzeroupper
        movaps  xmm6,XMMWORD[((-216))+rax]
        movaps  xmm7,XMMWORD[((-200))+rax]
        movaps  xmm8,XMMWORD[((-184))+rax]
        movaps  xmm9,XMMWORD[((-168))+rax]
        movaps  xmm10,XMMWORD[((-152))+rax]
        movaps  xmm11,XMMWORD[((-136))+rax]
        movaps  xmm12,XMMWORD[((-120))+rax]
        movaps  xmm13,XMMWORD[((-104))+rax]
        movaps  xmm14,XMMWORD[((-88))+rax]
        movaps  xmm15,XMMWORD[((-72))+rax]
        mov     r15,QWORD[((-48))+rax]

        mov     r14,QWORD[((-40))+rax]

        mov     r13,QWORD[((-32))+rax]

        mov     r12,QWORD[((-24))+rax]

        mov     rbp,QWORD[((-16))+rax]

        mov     rbx,QWORD[((-8))+rax]

        lea     rsp,[rax]

$L$epilogue_avx2:
        mov     rdi,QWORD[8+rsp]        ;WIN64 epilogue
        mov     rsi,QWORD[16+rsp]
        DB      0F3h,0C3h               ;repret

$L$SEH_end_sha1_multi_block_avx2:

; ----------------------------------------------------------------------
; Constant pool for the SHA-1 multi-block code.  The first row is
; 256-byte aligned; K_XX_XX sits 32 bytes into the pool, so the rows are
; reachable as small displacements from a register holding K_XX_XX:
;
;    -32  0x5a827999 x8   SHA-1 round constant (rounds  0..19)
;     +0  0x6ed9eba1 x8   SHA-1 round constant (rounds 20..39)
;    +32  0x8f1bbcdc x8   SHA-1 round constant (rounds 40..59)
;    +64  0xca62c1d6 x8   SHA-1 round constant (rounds 60..79)
;    +96  32-byte mask; the AVX2 loop reloads it into ymm9 at its bottom
;         (presumably the per-dword byte-swap shuffle mask - confirm at
;         the vpshufb sites)
;   +128  16-byte descending byte index 15..0 (NOTE(review): user not
;         visible in this part of the file)
;   +144  NUL-terminated identification string
;
; The routines also park the input pointer of a finished lane on the
; register holding this table (cmovle/cmovge rN,rbp).
; ----------------------------------------------------------------------
ALIGN   256
        times 8 DD 0x5a827999
K_XX_XX:
        times 8 DD 0x6ed9eba1
        times 8 DD 0x8f1bbcdc
        times 8 DD 0xca62c1d6
        times 2 DD 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
        DB      0x0f,0x0e,0x0d,0x0c,0x0b,0x0a,0x09,0x08
        DB      0x07,0x06,0x05,0x04,0x03,0x02,0x01,0x00
        DB      'SHA1 multi-block transform for x86_64, CRYPTOGAMS by <appro@openssl.org>',0
EXTERN  __imp_RtlVirtualUnwind

ALIGN   16
; ----------------------------------------------------------------------
; se_handler - Win64 language-specific exception handler referenced by
; the .xdata records of sha1_multi_block and its _shaext and _avx
; variants.
;
; In:   r8 = CONTEXT *context, r9 = DISPATCHER_CONTEXT *disp
;       (rcx/rdx, the other two handler arguments, are not read)
; Out:  eax = 1 (ExceptionContinueSearch)
;
; disp->HandlerData points at two image-relative offsets: [0] the label
; that ends the prologue, [1] the epilogue label.  If the faulting Rip
; lies between them the function's frame is live, so the handler
; recovers rbx, rbp and xmm6-xmm15 from it and writes them into the
; CONTEXT.  In every case it then rewinds Rsp/Rsi/Rdi in the CONTEXT to
; their values at function entry, copies the CONTEXT to
; disp->ContextRecord and calls RtlVirtualUnwind.
;
; The tail starting at $L$in_prologue is also entered from
; avx2_handler, with rax = function-entry rsp, r8 = context, r9 = disp.
; ----------------------------------------------------------------------
se_handler:
        push    rsi
        push    rdi
        push    rbx
        push    rbp
        push    r12
        push    r13
        push    r14
        push    r15
        pushfq
        sub     rsp,64                  ; 32-byte shadow space + 4 stack args

        mov     rsi,QWORD[8+r9]         ; disp->ImageBase
        mov     r11,QWORD[56+r9]        ; disp->HandlerData
        mov     rbx,QWORD[248+r8]       ; context->Rip
        mov     rax,QWORD[120+r8]       ; context->Rax (= entry rsp while in prologue)

        mov     r10d,DWORD[r11]         ; HandlerData[0]: RVA of end of prologue
        add     r10,rsi                 ; -> absolute address
        cmp     rbx,r10
        jb      NEAR $L$in_prologue     ; fault before the frame was set up

        mov     rax,QWORD[152+r8]       ; context->Rsp

        mov     r10d,DWORD[4+r11]       ; HandlerData[1]: RVA of epilogue label
        add     r10,rsi
        cmp     rbx,r10
        jae     NEAR $L$in_prologue     ; fault after the frame was torn down

        mov     rax,QWORD[272+rax]      ; entry rsp, stashed in the frame at [272+rsp]

        mov     rbp,QWORD[((-16))+rax]  ; saved rbp
        mov     rbx,QWORD[((-8))+rax]   ; saved rbx
        mov     QWORD[160+r8],rbp       ; context->Rbp
        mov     QWORD[144+r8],rbx       ; context->Rbx

        lea     rdi,[512+r8]            ; &context->Xmm6
        lea     rsi,[((-24-160))+rax]   ; saved xmm6..xmm15 in the frame
        mov     ecx,20                  ; 10 registers * 16 bytes / 8
        cld
        rep movsq

$L$in_prologue:
        mov     rsi,QWORD[16+rax]       ; rsi/rdi were parked in the caller's
        mov     rdi,QWORD[8+rax]        ; home slots by the WIN64 prologue
        mov     QWORD[168+r8],rsi       ; context->Rsi
        mov     QWORD[176+r8],rdi       ; context->Rdi
        mov     QWORD[152+r8],rax       ; context->Rsp = rsp at function entry

        mov     rsi,r8                  ; source: the patched context
        mov     rdi,QWORD[40+r9]        ; destination: disp->ContextRecord
        mov     ecx,154                 ; CONTEXT size in quadwords
        cld
        rep movsq

        mov     rsi,r9                  ; keep disp while r8/r9 become arguments
        xor     ecx,ecx                 ; arg1 = 0 (UNW_FLAG_NHANDLER)
        mov     rdx,QWORD[8+rsi]        ; arg2 = disp->ImageBase
        mov     r8,QWORD[rsi]           ; arg3 = disp->ControlPc
        mov     r9,QWORD[16+rsi]        ; arg4 = disp->FunctionEntry
        mov     r10,QWORD[40+rsi]       ; disp->ContextRecord
        lea     r11,[56+rsi]            ; &disp->HandlerData
        lea     r12,[24+rsi]            ; &disp->EstablisherFrame
        mov     QWORD[32+rsp],r10       ; arg5
        mov     QWORD[40+rsp],r11       ; arg6
        mov     QWORD[48+rsp],r12       ; arg7
        mov     QWORD[56+rsp],rcx       ; arg8 = NULL
        call    QWORD[__imp_RtlVirtualUnwind]

        mov     eax,1                   ; ExceptionContinueSearch
        add     rsp,64
        popfq                           ; also restores the direction flag
        pop     r15
        pop     r14
        pop     r13
        pop     r12
        pop     rbp
        pop     rbx
        pop     rdi
        pop     rsi
        DB      0F3h,0C3h               ;repret


ALIGN   16
; ----------------------------------------------------------------------
; avx2_handler - Win64 language-specific exception handler referenced by
; the .xdata record of the AVX2 code path.
;
; In:   r8 = CONTEXT *context, r9 = DISPATCHER_CONTEXT *disp
; Out:  does not return here; finishes through the shared tail at
;       $L$in_prologue (in se_handler), which patches Rsp/Rsi/Rdi, calls
;       RtlVirtualUnwind, pops the registers pushed below and returns
;       eax = 1.
;
; When the faulting Rip lies between the two labels in
; disp->HandlerData, the AVX2 frame is live: the handler recovers rbx,
; rbp, r12-r15 and xmm6-xmm15 from it and stores them in the CONTEXT.
; ----------------------------------------------------------------------
avx2_handler:
        push    rsi
        push    rdi
        push    rbx
        push    rbp
        push    r12
        push    r13
        push    r14
        push    r15
        pushfq
        sub     rsp,64                  ; shadow space + stack args used by the shared tail

        mov     rax,QWORD[120+r8]       ; context->Rax (= entry rsp while in prologue)
        mov     rbx,QWORD[248+r8]       ; context->Rip

        mov     rsi,QWORD[8+r9]         ; disp->ImageBase
        mov     r11,QWORD[56+r9]        ; disp->HandlerData

        mov     r10d,DWORD[r11]         ; HandlerData[0]: RVA of end of prologue
        lea     r10,[r10*1+rsi]
        cmp     rbx,r10
        jb      NEAR $L$in_prologue     ; fault before the frame was set up

        mov     rax,QWORD[152+r8]       ; context->Rsp

        mov     r10d,DWORD[4+r11]       ; HandlerData[1]: RVA of epilogue label
        lea     r10,[r10*1+rsi]
        cmp     rbx,r10
        jae     NEAR $L$in_prologue     ; fault after the frame was torn down

        ; The AVX2 code keeps the function-entry rsp in its own frame at
        ; [544+rsp] (its epilogue reloads it from there), so it has to be
        ; read relative to the interrupted Rsp now held in rax.  Reading
        ; [544+r8] instead would fetch bytes from the CONTEXT's XMM save
        ; area (which starts at 512+r8) and use them as a stack address.
        mov     rax,QWORD[544+rax]      ; pull saved stack pointer

        mov     rbx,QWORD[((-8))+rax]
        mov     rbp,QWORD[((-16))+rax]
        mov     r12,QWORD[((-24))+rax]
        mov     r13,QWORD[((-32))+rax]
        mov     r14,QWORD[((-40))+rax]
        mov     r15,QWORD[((-48))+rax]
        mov     QWORD[144+r8],rbx       ; context->Rbx
        mov     QWORD[160+r8],rbp       ; context->Rbp
        mov     QWORD[216+r8],r12       ; context->R12
        mov     QWORD[224+r8],r13       ; context->R13
        mov     QWORD[232+r8],r14       ; context->R14
        mov     QWORD[240+r8],r15       ; context->R15

        lea     rsi,[((-56-160))+rax]   ; saved xmm6..xmm15 in the frame
        lea     rdi,[512+r8]            ; &context->Xmm6
        mov     ecx,20                  ; 10 registers * 16 bytes / 8
        DD      0xa548f3fc              ; cld; rep movsq

        jmp     NEAR $L$in_prologue     ; shared tail: rax = entry rsp

section .pdata rdata align=4
ALIGN   4
; Function table: one entry per code path, each made of three
; image-relative addresses - start of the range, end of the range, and
; the .xdata unwind record that covers it.
        DD      $L$SEH_begin_sha1_multi_block wrt ..imagebase,$L$SEH_end_sha1_multi_block wrt ..imagebase,$L$SEH_info_sha1_multi_block wrt ..imagebase
        DD      $L$SEH_begin_sha1_multi_block_shaext wrt ..imagebase,$L$SEH_end_sha1_multi_block_shaext wrt ..imagebase,$L$SEH_info_sha1_multi_block_shaext wrt ..imagebase
        DD      $L$SEH_begin_sha1_multi_block_avx wrt ..imagebase,$L$SEH_end_sha1_multi_block_avx wrt ..imagebase,$L$SEH_info_sha1_multi_block_avx wrt ..imagebase
        DD      $L$SEH_begin_sha1_multi_block_avx2 wrt ..imagebase,$L$SEH_end_sha1_multi_block_avx2 wrt ..imagebase,$L$SEH_info_sha1_multi_block_avx2 wrt ..imagebase
section .xdata rdata align=8
ALIGN   8
; Unwind records, one per code path.  Each is a 4-byte header
; (version 1 with the "has exception handler" flag, i.e. the byte 9,
; followed by zero prologue size, zero unwind codes and no frame
; register), the image-relative address of the handler, and two
; image-relative labels that the handler reads as HandlerData[0] and
; HandlerData[1]: the end of the prologue and the start of the epilogue.
$L$SEH_info_sha1_multi_block:
        DB      (1<<3)|1,0,0,0
        DD      se_handler wrt ..imagebase
        DD      $L$body wrt ..imagebase
        DD      $L$epilogue wrt ..imagebase
$L$SEH_info_sha1_multi_block_shaext:
        DB      (1<<3)|1,0,0,0
        DD      se_handler wrt ..imagebase
        DD      $L$body_shaext wrt ..imagebase
        DD      $L$epilogue_shaext wrt ..imagebase
$L$SEH_info_sha1_multi_block_avx:
        DB      (1<<3)|1,0,0,0
        DD      se_handler wrt ..imagebase
        DD      $L$body_avx wrt ..imagebase
        DD      $L$epilogue_avx wrt ..imagebase
$L$SEH_info_sha1_multi_block_avx2:
        DB      (1<<3)|1,0,0,0
        DD      avx2_handler wrt ..imagebase
        DD      $L$body_avx2 wrt ..imagebase
        DD      $L$epilogue_avx2 wrt ..imagebase
